Initializing repository with study materials
This commit is contained in:
165
text_coding/analysis/detailed_analysis.py
Normal file
165
text_coding/analysis/detailed_analysis.py
Normal file
@@ -0,0 +1,165 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Detailed Analysis with Output Files
|
||||
"""
|
||||
|
||||
import csv
|
||||
from collections import defaultdict, Counter
|
||||
from itertools import combinations
|
||||
import math
|
||||
|
||||
def load_data(filename):
    """Load a CSV file and return its rows as a list of dicts.

    Parameters
    ----------
    filename : str
        Path to a CSV file whose first row is a header.

    Returns
    -------
    list[dict]
        One dict per data row, keyed by the header fields.
    """
    # newline='' is required by the csv module so that quoted fields
    # containing newlines are parsed correctly and no line-ending
    # translation interferes with the reader.
    with open(filename, 'r', newline='') as f:
        return list(csv.DictReader(f))
|
||||
|
||||
def _extract_virtues(rows, virtue_cols):
    """Return, for each input row, the list of non-empty virtue codes.

    Blank cells and cells containing only whitespace are skipped.
    """
    all_virtues_per_row = []
    for row in rows:
        virtues = []
        for col in virtue_cols:
            raw = row.get(col)
            val = raw.strip() if raw else ''
            if val:
                virtues.append(val)
        all_virtues_per_row.append(virtues)
    return all_virtues_per_row


def _count_cooccurrences(all_virtues_per_row):
    """Count co-occurring virtue pairs; keys are (v1, v2) sorted alphabetically."""
    cooccurrence = Counter()
    for virtues in all_virtues_per_row:
        # sorted() guarantees a canonical key order so (A, B) == (B, A).
        for pair in combinations(sorted(virtues), 2):
            cooccurrence[pair] += 1
    return cooccurrence


def _build_virtue_sets(all_virtues_per_row):
    """Map each virtue to the set of row indices in which it appears."""
    virtue_sets = defaultdict(set)
    for idx, virtues in enumerate(all_virtues_per_row):
        for v in virtues:
            virtue_sets[v].add(idx)
    return virtue_sets


def _jaccard(set1, set2):
    """Jaccard similarity of two index sets; 0 when both are empty."""
    union = set1 | set2
    return len(set1 & set2) / len(union) if union else 0


def _write_cooccurrence_matrix(top_virtues, cooccurrence):
    """Write a symmetric co-occurrence count matrix for the top virtues."""
    with open('cooccurrence_matrix.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Virtue'] + top_virtues)
        for v1 in top_virtues:
            out_row = [v1]
            for v2 in top_virtues:
                if v1 == v2:
                    out_row.append('')  # diagonal left blank
                else:
                    # Keys were stored alphabetically sorted; look up likewise.
                    out_row.append(cooccurrence.get((min(v1, v2), max(v1, v2)), 0))
            writer.writerow(out_row)


def _write_jaccard_matrix(top_virtues, virtue_sets):
    """Write a Jaccard-similarity matrix for the top virtues."""
    with open('jaccard_similarity_matrix.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Virtue'] + top_virtues)
        for v1 in top_virtues:
            out_row = [v1]
            for v2 in top_virtues:
                if v1 == v2:
                    out_row.append('1.0')
                else:
                    out_row.append(f"{_jaccard(virtue_sets[v1], virtue_sets[v2]):.3f}")
            writer.writerow(out_row)


def _print_centrality(cooccurrence, virtue_freq):
    """Print degree centrality: how many distinct virtues each virtue co-occurs with."""
    print("\n" + "=" * 70)
    print("VIRTUE NETWORK CENTRALITY ANALYSIS")
    print("=" * 70)

    connections = defaultdict(set)
    for (v1, v2), count in cooccurrence.items():
        if count >= 1:
            connections[v1].add(v2)
            connections[v2].add(v1)

    centrality = [(v, len(connections[v])) for v in virtue_freq.keys()]
    centrality.sort(key=lambda x: x[1], reverse=True)

    print("\nTop 'Hub' Virtues (connect to most other virtue types):")
    print(f"{'Virtue':<40} {'Connections':<12}")
    print("-" * 55)
    for virtue, degree in centrality[:15]:
        # NOTE(review): set iteration order is arbitrary, so the sampled
        # neighbours may differ between runs — same as the original code.
        nearby = list(connections[virtue])[:5]
        print(f"{virtue:<40} {degree:<12} → {', '.join(nearby)}")


def _print_source_density(rows, all_virtues_per_row, virtue_freq):
    """Print per-source network complexity (pair counts and density)."""
    print("\n" + "=" * 70)
    print("NETWORK COMPLEXITY BY SOURCE")
    print("=" * 70)

    source_per_row = [row.get('Source', 'Unknown') for row in rows]
    n_virtues = len(virtue_freq)
    # Maximum number of distinct undirected pairs; guard against a
    # ZeroDivisionError when fewer than two virtues exist in the data.
    possible_pairs = n_virtues * (n_virtues - 1) / 2

    for source in ['AFP', 'PR']:
        source_indices = [i for i, s in enumerate(source_per_row) if s == source]
        source_pairs = Counter()
        for idx in source_indices:
            for pair in combinations(sorted(all_virtues_per_row[idx]), 2):
                source_pairs[pair] += 1

        unique_connections = len(source_pairs)
        total_texts = len(source_indices)
        avg_pairs = sum(source_pairs.values()) / total_texts if total_texts else 0
        density = unique_connections / possible_pairs * 100 if possible_pairs else 0.0

        print(f"\n{source}:")
        print(f" Texts: {total_texts}")
        print(f" Unique virtue pairs: {unique_connections}")
        print(f" Avg pairs per text: {avg_pairs:.2f}")
        print(f" Network density: {density:.1f}%")


def _export_profiles(rows, all_virtues_per_row):
    """Write one JSON profile per text: id, source, virtues, virtue count."""
    import json  # hoisted here from the middle of the original write block

    profiles = [
        {
            'id': i,
            'source': row.get('Source', ''),
            'virtues': virtues,
            'virtue_count': len(virtues),
        }
        for i, (row, virtues) in enumerate(zip(rows, all_virtues_per_row))
    ]
    with open('virtue_profiles.json', 'w') as f:
        json.dump(profiles, f, indent=2)


def _write_strong_associations(cooccurrence, virtue_sets, n_rows):
    """Write the 50 most frequent virtue pairs with Jaccard and expected counts."""
    with open('strong_associations.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Virtue_1', 'Virtue_2', 'Co_count', 'Jaccard', 'Observed', 'Expected'])
        for (v1, v2), count in cooccurrence.most_common(50):
            set1 = virtue_sets[v1]
            set2 = virtue_sets[v2]
            jaccard = _jaccard(set1, set2)

            # Expected co-occurrence under independence: N * p(v1) * p(v2).
            p1 = len(set1) / n_rows if n_rows else 0
            p2 = len(set2) / n_rows if n_rows else 0
            expected = n_rows * p1 * p2

            writer.writerow([v1, v2, count, f"{jaccard:.3f}", count, f"{expected:.2f}"])


def main():
    """Run the full virtue co-occurrence analysis on coding.csv.

    Reads 'coding.csv' from the working directory, prints centrality and
    per-source complexity reports to stdout, and writes four output files:
    cooccurrence_matrix.csv, jaccard_similarity_matrix.csv,
    virtue_profiles.json, and strong_associations.csv.
    """
    rows = load_data('coding.csv')

    # Extract virtues
    virtue_cols = ['Virtue_1', 'Virtue_2', 'Virtue_3', 'Virtue_4', 'Virtue_5']
    all_virtues_per_row = _extract_virtues(rows, virtue_cols)

    # Top 25 virtues define the rows/columns of both matrices.
    all_virtues_flat = [v for sublist in all_virtues_per_row for v in sublist]
    virtue_freq = Counter(all_virtues_flat)
    top_virtues = [v for v, c in virtue_freq.most_common(25)]

    cooccurrence = _count_cooccurrences(all_virtues_per_row)

    _write_cooccurrence_matrix(top_virtues, cooccurrence)
    print("Created: cooccurrence_matrix.csv")

    virtue_sets = _build_virtue_sets(all_virtues_per_row)
    _write_jaccard_matrix(top_virtues, virtue_sets)
    print("Created: jaccard_similarity_matrix.csv")

    _print_centrality(cooccurrence, virtue_freq)
    _print_source_density(rows, all_virtues_per_row, virtue_freq)

    _export_profiles(rows, all_virtues_per_row)
    print("\nCreated: virtue_profiles.json")

    _write_strong_associations(cooccurrence, virtue_sets, len(rows))
    print("Created: strong_associations.csv")
|
||||
|
||||
# Script entry point: run the analysis only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user