# Source listing metadata (extraction artifact, kept as a comment header):
#   Files: protocol-virtues-study/text_coding/analysis/detailed_analysis.py
#   2026-03-29 15:25:34 -06:00 — 166 lines, 5.7 KiB, Python
#!/usr/bin/env python3
"""
Detailed Analysis with Output Files
"""
import csv
from collections import defaultdict, Counter
from itertools import combinations
import math
def load_data(filename):
    """Load a CSV file with a header row into a list of dicts.

    Parameters
    ----------
    filename : str
        Path to the CSV file to read.

    Returns
    -------
    list[dict]
        One dict per data row, keyed by the header-row column names.
    """
    # newline='' is the documented way to open files for the csv module;
    # without it, newlines embedded in quoted fields are mistranslated.
    with open(filename, 'r', newline='') as f:
        return list(csv.DictReader(f))
VIRTUE_COLS = ['Virtue_1', 'Virtue_2', 'Virtue_3', 'Virtue_4', 'Virtue_5']


def _extract_virtues(rows):
    """Return, for each row, the list of non-empty Virtue_1..Virtue_5 codes."""
    per_row = []
    for row in rows:
        # (row.get(col) or '') collapses both a missing key and a None value
        # into '' with a single lookup; whitespace-only cells are dropped.
        virtues = [v for v in ((row.get(col) or '').strip() for col in VIRTUE_COLS) if v]
        per_row.append(virtues)
    return per_row


def _count_pairs(virtue_lists):
    """Count co-occurring virtue pairs; keys are alphabetically ordered tuples."""
    cooccurrence = Counter()
    for virtues in virtue_lists:
        cooccurrence.update(combinations(sorted(virtues), 2))
    return cooccurrence


def _jaccard(set1, set2):
    """Jaccard similarity of two index sets (0 when both are empty)."""
    union = set1 | set2
    return len(set1 & set2) / len(union) if union else 0


def _write_cooccurrence_matrix(top_virtues, cooccurrence):
    """Export the raw pair-count matrix for the top virtues."""
    with open('cooccurrence_matrix.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Virtue'] + top_virtues)
        for v1 in top_virtues:
            row = [v1]
            for v2 in top_virtues:
                if v1 == v2:
                    row.append('')  # diagonal left blank
                else:
                    # Pair keys are stored alphabetically ordered.
                    row.append(cooccurrence.get((min(v1, v2), max(v1, v2)), 0))
            writer.writerow(row)
    print("Created: cooccurrence_matrix.csv")


def _write_jaccard_matrix(top_virtues, virtue_sets):
    """Export the pairwise Jaccard-similarity matrix for the top virtues."""
    with open('jaccard_similarity_matrix.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Virtue'] + top_virtues)
        for v1 in top_virtues:
            row = [v1]
            for v2 in top_virtues:
                if v1 == v2:
                    row.append('1.0')
                else:
                    row.append(f"{_jaccard(virtue_sets[v1], virtue_sets[v2]):.3f}")
            writer.writerow(row)
    print("Created: jaccard_similarity_matrix.csv")


def _print_centrality(cooccurrence, virtue_freq):
    """Print degree centrality: how many distinct virtues each co-occurs with."""
    print("\n" + "=" * 70)
    print("VIRTUE NETWORK CENTRALITY ANALYSIS")
    print("=" * 70)
    connections = defaultdict(set)
    for (v1, v2), count in cooccurrence.items():
        if count >= 1:
            connections[v1].add(v2)
            connections[v2].add(v1)
    # Isolated virtues get degree 0 via the defaultdict.
    centrality = [(v, len(connections[v])) for v in virtue_freq.keys()]
    centrality.sort(key=lambda x: x[1], reverse=True)
    print("\nTop 'Hub' Virtues (connect to most other virtue types):")
    print(f"{'Virtue':<40} {'Connections':<12}")
    print("-" * 55)
    for virtue, degree in centrality[:15]:
        nearby = list(connections[virtue])[:5]
        print(f"{virtue:<40} {degree:<12}{', '.join(nearby)}")


def _print_source_density(rows, virtue_lists, virtue_freq):
    """Compare virtue-network complexity between the AFP and PR sources."""
    print("\n" + "=" * 70)
    print("NETWORK COMPLEXITY BY SOURCE")
    print("=" * 70)
    source_per_row = [row.get('Source', 'Unknown') for row in rows]
    # Maximum possible number of distinct pairs among all observed virtues.
    n_virtues = len(virtue_freq)
    max_pairs = n_virtues * (n_virtues - 1) / 2
    for source in ['AFP', 'PR']:
        source_indices = [i for i, s in enumerate(source_per_row) if s == source]
        source_pairs = Counter()
        for idx in source_indices:
            source_pairs.update(combinations(sorted(virtue_lists[idx]), 2))
        unique_connections = len(source_pairs)
        total_texts = len(source_indices)
        avg_pairs = sum(source_pairs.values()) / total_texts if total_texts else 0
        # Guard: with fewer than two virtue types the density ratio is undefined.
        density = unique_connections / max_pairs * 100 if max_pairs else 0.0
        print(f"\n{source}:")
        print(f" Texts: {total_texts}")
        print(f" Unique virtue pairs: {unique_connections}")
        print(f" Avg pairs per text: {avg_pairs:.2f}")
        print(f" Network density: {density:.1f}%")


def _write_profiles(rows, virtue_lists):
    """Export one JSON profile record per coded text."""
    import json  # hoisted from the middle of the original main() body
    profiles = [
        {
            'id': i,
            'source': row.get('Source', ''),
            'virtues': virtues,
            'virtue_count': len(virtues),
        }
        for i, (row, virtues) in enumerate(zip(rows, virtue_lists))
    ]
    with open('virtue_profiles.json', 'w') as f:
        json.dump(profiles, f, indent=2)
    print("\nCreated: virtue_profiles.json")


def _write_strong_associations(cooccurrence, virtue_sets, n_rows):
    """Export the 50 most frequent pairs with Jaccard and chance-expected counts."""
    with open('strong_associations.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Virtue_1', 'Virtue_2', 'Co_count', 'Jaccard', 'Observed', 'Expected'])
        for (v1, v2), count in cooccurrence.most_common(50):
            jaccard = _jaccard(virtue_sets[v1], virtue_sets[v2])
            # Expected co-occurrence count if the two virtues were independent.
            p1 = len(virtue_sets[v1]) / n_rows
            p2 = len(virtue_sets[v2]) / n_rows
            expected = n_rows * p1 * p2
            writer.writerow([v1, v2, count, f"{jaccard:.3f}", count, f"{expected:.2f}"])
    print("Created: strong_associations.csv")


def main():
    """Run the full virtue co-occurrence analysis over coding.csv.

    Reads 'coding.csv' from the working directory and produces
    cooccurrence_matrix.csv, jaccard_similarity_matrix.csv,
    virtue_profiles.json and strong_associations.csv, plus centrality
    and per-source network-density summaries on stdout.
    """
    rows = load_data('coding.csv')
    all_virtues_per_row = _extract_virtues(rows)

    # Frequency of every virtue code; the top 25 define the matrix axes.
    virtue_freq = Counter(v for virtues in all_virtues_per_row for v in virtues)
    top_virtues = [v for v, _ in virtue_freq.most_common(25)]

    cooccurrence = _count_pairs(all_virtues_per_row)

    # Map each virtue to the set of row indices it appears in (for Jaccard).
    virtue_sets = defaultdict(set)
    for idx, virtues in enumerate(all_virtues_per_row):
        for v in virtues:
            virtue_sets[v].add(idx)

    _write_cooccurrence_matrix(top_virtues, cooccurrence)
    _write_jaccard_matrix(top_virtues, virtue_sets)
    _print_centrality(cooccurrence, virtue_freq)
    _print_source_density(rows, all_virtues_per_row, virtue_freq)
    _write_profiles(rows, all_virtues_per_row)
    _write_strong_associations(cooccurrence, virtue_sets, len(rows))


if __name__ == "__main__":
    main()