Initializing repository with study materials
This commit is contained in:
324
text_coding/analysis/analysis.py
Normal file
324
text_coding/analysis/analysis.py
Normal file
@@ -0,0 +1,324 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Multivariate Analysis of coding.csv Virtue Data
|
||||
Uses only Python standard library
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
from collections import defaultdict, Counter
|
||||
from itertools import combinations
|
||||
import math
|
||||
|
||||
def load_data(filename):
    """Load a CSV file and return its rows as a list of dicts.

    Each dict maps column header -> cell value (all values are strings,
    as produced by csv.DictReader).

    Args:
        filename: path to the CSV file.

    Returns:
        list[dict[str, str]]: one dict per data row, in file order.
    """
    # newline='' is required by the csv module so that quoted fields
    # containing embedded newlines are parsed correctly.
    with open(filename, 'r', newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        rows = list(reader)
    return rows
|
||||
|
||||
def jaccard_similarity(set1, set2):
    """Return the Jaccard index |A ∩ B| / |A ∪ B| of two sets.

    Two empty sets are treated as identical (similarity 1.0).
    """
    # Edge case: both empty -> perfectly similar by convention.
    if not set1 and not set2:
        return 1.0
    union = set1 | set2
    # Guard against an empty union (cannot occur after the check above,
    # kept for defensive parity with division-by-zero handling).
    if not union:
        return 0.0
    return len(set1 & set2) / len(union)
|
||||
|
||||
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two numeric vectors.

    Yields 0.0 when either vector has zero magnitude (undefined angle).
    """
    dot = 0
    sq1 = 0
    sq2 = 0
    # Single pass accumulates the dot product and both squared norms.
    for a, b in zip(vec1, vec2):
        dot += a * b
        sq1 += a * a
        sq2 += b * b
    denom = math.sqrt(sq1) * math.sqrt(sq2)
    if denom == 0:
        return 0.0
    return dot / denom
|
||||
|
||||
def hierarchical_clustering(distance_matrix, labels, n_clusters=4):
    """Agglomerative hierarchical clustering with average (UPGMA) linkage.

    Repeatedly merges the closest pair of active clusters until only
    ``n_clusters`` remain.  The input matrix is copied, not mutated.

    Bug fixes vs. the previous version: the new-row/column extension of
    the distance matrix indexed one past the end of ``new_distances``
    (IndexError on the first merge) and appended the same value to every
    row; the "average" linkage also ignored cluster sizes, dividing the
    sum of two distances by 2 unconditionally.

    Args:
        distance_matrix: square list-of-lists of pairwise distances.
        labels: one label per item; only its length is used here.
        n_clusters: number of clusters to stop at.

    Returns:
        (final_labels, clusters): ``final_labels`` assigns each original
        item an integer cluster id in ``range(n_clusters)`` (fewer if
        merging stopped early); ``clusters`` is the full list of member
        sets created during merging (original singletons first).
    """
    n = len(labels)
    # Work on a copy so the caller's matrix is left untouched.
    dist = [list(row) for row in distance_matrix]
    clusters = [{i} for i in range(n)]
    remaining = set(range(n))

    while len(remaining) > n_clusters:
        # Find the closest pair of active clusters.
        min_dist = float('inf')
        to_merge = None
        for i in remaining:
            for j in remaining:
                if i < j and dist[i][j] < min_dist:
                    min_dist = dist[i][j]
                    to_merge = (i, j)

        if to_merge is None:
            break

        i, j = to_merge
        size_i = len(clusters[i])
        size_j = len(clusters[j])
        new_idx = len(clusters)
        clusters.append(clusters[i] | clusters[j])

        # Size-weighted average linkage: d(new, k) =
        # (|Ci|*d(i,k) + |Cj|*d(j,k)) / (|Ci| + |Cj|).
        new_row = []
        for k in range(len(dist)):
            if k in remaining and k not in (i, j):
                avg = (size_i * dist[i][k] + size_j * dist[j][k]) / (size_i + size_j)
                new_row.append(avg)
            else:
                # Inactive clusters (and the merged pair) can never be
                # selected again, so park them at infinity.
                new_row.append(float('inf'))

        # Extend the matrix symmetrically: each existing row gets its
        # own column entry, then the new row (with an inf self-distance).
        for k, row in enumerate(dist):
            row.append(new_row[k])
        new_row.append(float('inf'))
        dist.append(new_row)

        remaining.remove(i)
        remaining.remove(j)
        remaining.add(new_idx)

    # Relabel the surviving clusters 0..len(remaining)-1 (sorted for
    # deterministic output) and project back onto the original items.
    final_labels = [0] * n
    for idx, cluster_idx in enumerate(sorted(remaining)):
        for item in clusters[cluster_idx]:
            final_labels[item] = idx

    return final_labels, clusters
|
||||
|
||||
def kmeans_clustering(vectors, k=3, max_iter=100):
    """Cluster equal-length numeric vectors with Lloyd's k-means algorithm.

    Initialization is deterministic (evenly spaced input vectors), so
    repeated runs on the same data produce identical assignments.

    Bug fix vs. the previous version: ``vectors[::n//k]`` raised
    ``ValueError: slice step cannot be zero`` whenever ``n < k``; the
    requested ``k`` is now clamped to the number of vectors, and empty
    input returns an empty list instead of crashing.

    Args:
        vectors: list of equal-length numeric lists.
        k: requested number of clusters (clamped to ``len(vectors)``).
        max_iter: maximum assign/update iterations before giving up.

    Returns:
        list[int]: a cluster id for each input vector.
    """
    n = len(vectors)
    if n == 0:
        return []
    # Clamp so the evenly spaced init below is always well defined.
    k = min(k, n)
    # Deterministic initialization: every (n//k)-th vector, copied so the
    # update step never aliases an input list.
    step = max(1, n // k)
    centers = [list(v) for v in vectors[::step][:k]]
    dim = len(vectors[0])

    assignments = []
    for _iteration in range(max_iter):
        # Assignment step: nearest center by squared Euclidean distance.
        assignments = []
        for vec in vectors:
            distances = [sum((a - b) ** 2 for a, b in zip(vec, c)) for c in centers]
            assignments.append(distances.index(min(distances)))

        # Update step: move each center to the mean of its members;
        # an empty cluster keeps its previous center.
        new_centers = []
        for cluster_id in range(k):
            members = [vectors[i] for i in range(n) if assignments[i] == cluster_id]
            if members:
                new_centers.append([sum(v[d] for v in members) / len(members)
                                    for d in range(dim)])
            else:
                new_centers.append(centers[cluster_id])

        if new_centers == centers:  # converged
            break
        centers = new_centers

    return assignments
|
||||
|
||||
def main():
    """Run the full multivariate analysis of coding.csv and print a report.

    Reads 'coding.csv' from the current working directory (assumes columns
    'Virtue_1'..'Virtue_5' and 'Source' — TODO confirm against the data
    file), then prints six numbered sections: frequency distribution,
    co-occurrence/Jaccard associations, k-means text clusters, threshold-
    based virtue communities, per-source patterns, and a summary.
    All output goes to stdout; nothing is returned.
    """
    print("=" * 70)
    print("MULTIVARIATE ANALYSIS OF CODING.CSV")
    print("=" * 70)

    # Load data
    rows = load_data('coding.csv')
    print(f"\nDataset: {len(rows)} texts coded")

    # Extract virtues per row: up to five virtue codes per text, blanks skipped.
    virtue_cols = ['Virtue_1', 'Virtue_2', 'Virtue_3', 'Virtue_4', 'Virtue_5']
    all_virtues_per_row = []
    source_per_row = []

    for row in rows:
        virtues = []
        for col in virtue_cols:
            # Guard against both missing columns and None cell values.
            val = row.get(col, '').strip() if row.get(col) else ''
            if val:
                virtues.append(val)
        all_virtues_per_row.append(virtues)
        source_per_row.append(row.get('Source', 'Unknown'))

    # Statistics
    virtue_counts = [len(v) for v in all_virtues_per_row]
    avg_virtues = sum(virtue_counts) / len(virtue_counts)
    print(f"\nCoding Statistics:")
    print(f" - Average virtues per text: {avg_virtues:.2f}")
    print(f" - Range: {min(virtue_counts)} - {max(virtue_counts)}")

    # All unique virtues (sorted so downstream indexing is deterministic).
    all_virtues_flat = [v for sublist in all_virtues_per_row for v in sublist]
    unique_virtues = sorted(set(all_virtues_flat))
    print(f" - Unique virtue categories: {len(unique_virtues)}")

    # Frequency analysis
    print("\n" + "=" * 70)
    print("1. FREQUENCY DISTRIBUTION OF VIRTUES")
    print("=" * 70)
    virtue_freq = Counter(all_virtues_flat)
    print(f"\n{'Rank':<6} {'Count':<6} {'Virtue':<40}")
    print("-" * 55)
    for rank, (virtue, count) in enumerate(virtue_freq.most_common(30), 1):
        # Percentage is relative to number of texts, not total virtue mentions.
        pct = (count / len(rows)) * 100
        print(f"{rank:<6} {count:<6} {virtue:<40} ({pct:.1f}%)")

    # Create binary matrix (presence/absence)
    print("\n" + "=" * 70)
    print("2. CO-OCCURRENCE ANALYSIS")
    print("=" * 70)

    # Co-occurrence counter: each unordered virtue pair within one text.
    # sorted() canonicalizes pair order so (a, b) and (b, a) share a key.
    cooccurrence = Counter()
    for virtues in all_virtues_per_row:
        for pair in combinations(sorted(virtues), 2):
            cooccurrence[pair] += 1

    print(f"\nTop 20 Virtue Pairs (appear in same text):")
    print(f"{'Virtue 1':<30} {'Virtue 2':<30} {'Count':<6}")
    print("-" * 70)
    for (v1, v2), count in cooccurrence.most_common(20):
        print(f"{v1:<30} {v2:<30} {count:<6}")

    # Association strength (Jaccard index)
    print(f"\n\nStrongest Associations (Jaccard Similarity):")
    print(f"{'Virtue 1':<30} {'Virtue 2':<30} {'Jaccard':<8}")
    print("-" * 70)

    # Invert the coding: for each virtue, the set of text indices using it.
    virtue_sets = defaultdict(set)
    for idx, virtues in enumerate(all_virtues_per_row):
        for v in virtues:
            virtue_sets[v].add(idx)

    associations = []
    for (v1, v2), count in cooccurrence.items():
        set1 = virtue_sets[v1]
        set2 = virtue_sets[v2]
        # Union is non-empty here: every co-occurring pair appears in >= 1 text.
        jaccard = len(set1 & set2) / len(set1 | set2)
        associations.append((jaccard, v1, v2, count))

    # Sort by Jaccard descending (tuples compare element-wise).
    associations.sort(reverse=True)
    for jaccard, v1, v2, count in associations[:20]:
        if count >= 2:  # Only show pairs that appear at least twice
            print(f"{v1:<30} {v2:<30} {jaccard:.3f}")

    # Create binary vectors for each text (presence/absence over all virtues).
    virtue_to_idx = {v: i for i, v in enumerate(unique_virtues)}
    binary_vectors = []
    for virtues in all_virtues_per_row:
        vec = [0] * len(unique_virtues)
        for v in virtues:
            if v in virtue_to_idx:
                vec[virtue_to_idx[v]] = 1
        binary_vectors.append(vec)

    # Clustering
    print("\n" + "=" * 70)
    print("3. CLUSTER ANALYSIS OF TEXTS (based on virtue profiles)")
    print("=" * 70)

    # K-means clustering
    k = 4
    clusters = kmeans_clustering(binary_vectors, k=k)

    print(f"\nK-Means Clustering (k={k}):")
    print("-" * 70)

    for cluster_id in range(k):
        cluster_texts = [i for i, c in enumerate(clusters) if c == cluster_id]
        cluster_size = len(cluster_texts)

        # Get dominant virtues in this cluster
        cluster_virtues = []
        for idx in cluster_texts:
            cluster_virtues.extend(all_virtues_per_row[idx])
        cluster_virtue_freq = Counter(cluster_virtues)

        print(f"\nCluster {cluster_id + 1} ({cluster_size} texts):")
        print(f" Sources: {', '.join(set(source_per_row[i] for i in cluster_texts))}")
        print(f" Top virtues: {', '.join([f'{v}({c})' for v, c in cluster_virtue_freq.most_common(5)])}")

    # Cluster similarity analysis
    print("\n" + "=" * 70)
    print("4. VIRTUE CLUSTERING (which virtues tend to co-occur)")
    print("=" * 70)

    # Create virtue-virtue similarity matrix based on co-occurrence
    print("\nVirtue Communities (highly connected groups):")

    # Build adjacency list
    adjacency = defaultdict(lambda: defaultdict(float))
    for (v1, v2), count in cooccurrence.items():
        total_v1 = virtue_freq[v1]
        total_v2 = virtue_freq[v2]
        # Normalized co-occurrence (pointwise mutual information-like)
        if total_v1 > 0 and total_v2 > 0:
            strength = count / math.sqrt(total_v1 * total_v2)
            adjacency[v1][v2] = strength
            adjacency[v2][v1] = strength

    # Simple community detection by threshold: depth-first flood fill over
    # edges whose normalized strength is >= 0.3 (fixed heuristic cutoff).
    visited = set()
    communities = []

    for virtue in unique_virtues:
        if virtue not in visited:
            community = set()
            stack = [virtue]
            while stack:
                current = stack.pop()
                if current not in visited:
                    visited.add(current)
                    community.add(current)
                    for neighbor, strength in adjacency[current].items():
                        if strength >= 0.3 and neighbor not in visited:
                            stack.append(neighbor)
            # Only keep communities of 3+ virtues; smaller groups are noise.
            if len(community) >= 3:
                communities.append(sorted(community))

    if communities:
        for i, community in enumerate(communities[:6], 1):
            print(f"\nCommunity {i}: {', '.join(community[:8])}")
            if len(community) > 8:
                print(f" ... and {len(community) - 8} more")
    else:
        print("No strong communities detected with current threshold")

    # Sources analysis
    print("\n" + "=" * 70)
    print("5. SOURCE-BASED PATTERN ANALYSIS")
    print("=" * 70)

    source_virtues = defaultdict(list)
    for idx, (source, virtues) in enumerate(zip(source_per_row, all_virtues_per_row)):
        source_virtues[source].extend(virtues)

    print(f"\n{'Source':<15} {'Texts':<8} {'Top Virtues (frequency)'}")
    print("-" * 70)
    for source in sorted(set(source_per_row)):
        texts = source_per_row.count(source)
        freq = Counter(source_virtues[source])
        top = ', '.join([f"{v}({c})" for v, c in freq.most_common(4)])
        print(f"{source:<15} {texts:<8} {top}")

    # Summary insights
    print("\n" + "=" * 70)
    print("6. KEY INSIGHTS")
    print("=" * 70)

    # NOTE(review): associations[0] below raises IndexError if no text has
    # two or more virtues (no co-occurring pairs) — confirm the data
    # guarantees at least one pair.
    print(f"""
SUMMARY:
- Dataset contains {len(rows)} texts from {len(set(source_per_row))} different sources
- {len(unique_virtues)} unique virtue categories were identified
- Texts have an average of {avg_virtues:.1f} virtues assigned (range: {min(virtue_counts)}-{max(virtue_counts)})

TOP FINDINGS:
1. Most frequent virtue: '{virtue_freq.most_common(1)[0][0]}' ({virtue_freq.most_common(1)[0][1]} occurrences)
2. Strongest virtue pair: '{associations[0][1]}' + '{associations[0][2]}' (Jaccard: {associations[0][0]:.3f})
3. Multiple distinct virtue communities detected, suggesting conceptual clustering
4. {len([c for c in communities if len(c) >= 3])} major virtue communities identified
""")
|
||||
|
||||
# Script entry point: run the full analysis only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user