#!/usr/bin/env python3
"""
Multivariate Analysis of coding.csv Virtue Data

Uses only Python standard library.
"""
import csv
import json
from collections import defaultdict, Counter
from itertools import combinations
import math


def load_data(filename):
    """Load a CSV file and return its rows as a list of dicts (one per row)."""
    # newline='' is the csv-module-recommended mode so that embedded
    # newlines inside quoted fields are parsed correctly.
    with open(filename, 'r', newline='') as f:
        reader = csv.DictReader(f)
        rows = list(reader)
    return rows


def jaccard_similarity(set1, set2):
    """Calculate Jaccard similarity between two sets.

    Two empty sets are considered identical (similarity 1.0); a zero-size
    union otherwise yields 0.0 as a defensive fallback.
    """
    if not set1 and not set2:
        return 1.0
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union > 0 else 0.0


def cosine_similarity(vec1, vec2):
    """Calculate cosine similarity between two (binary) numeric vectors.

    Returns 0.0 if either vector has zero norm (undefined angle).
    """
    dot = sum(a * b for a, b in zip(vec1, vec2))
    norm1 = math.sqrt(sum(a * a for a in vec1))
    norm2 = math.sqrt(sum(b * b for b in vec2))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)


def hierarchical_clustering(distance_matrix, labels, n_clusters=4):
    """Simple agglomerative hierarchical clustering using average linkage.

    Parameters:
        distance_matrix: square, symmetric list-of-lists of pairwise
            distances between the n initial items.  NOTE: the matrix is
            grown in place as clusters merge.
        labels: one label per item (only its length is used here).
        n_clusters: stop merging once this many clusters remain.

    Returns:
        (final_labels, clusters): final_labels[i] is the 0-based cluster id
        assigned to item i; clusters is the full list of merge sets built
        (singletons first, then each merged cluster).

    Bug fix vs. the previous version: the matrix extension appended the
    same out-of-range element of the new row to every existing row (an
    IndexError on the first merge) and never added a self-distance entry,
    so the matrix lost squareness.  The update below extends the matrix
    symmetrically and uses a size-weighted average-linkage formula.
    """
    n = len(labels)
    # Each item starts in its own singleton cluster; merged clusters are
    # appended to this list and indexed past n-1.
    clusters = [{i} for i in range(n)]
    remaining = set(range(n))

    while len(remaining) > n_clusters:
        # Find the closest pair of active clusters.
        min_dist = float('inf')
        to_merge = None
        for i in remaining:
            for j in remaining:
                if i < j and distance_matrix[i][j] < min_dist:
                    min_dist = distance_matrix[i][j]
                    to_merge = (i, j)
        if to_merge is None:
            break
        i, j = to_merge
        new_idx = len(clusters)
        clusters.append(clusters[i] | clusters[j])

        # Average linkage, weighted by cluster sizes:
        #   d(new, k) = (|Ci|*d(i,k) + |Cj|*d(j,k)) / (|Ci| + |Cj|)
        size_i = len(clusters[i])
        size_j = len(clusters[j])
        new_row = []
        for k in range(len(distance_matrix)):
            if k in remaining and k not in (i, j):
                d = (size_i * distance_matrix[i][k]
                     + size_j * distance_matrix[j][k]) / (size_i + size_j)
                new_row.append(d)
            else:
                # Inactive (already merged) clusters must never be picked again.
                new_row.append(float('inf'))

        # Extend the matrix symmetrically: every old row gains the new
        # column, then the new row (with 0 self-distance) is appended.
        for k, row in enumerate(distance_matrix):
            row.append(new_row[k])
        new_row.append(0.0)
        distance_matrix.append(new_row)

        remaining.remove(i)
        remaining.remove(j)
        remaining.add(new_idx)

    # Map each surviving cluster to a compact 0-based label.  Sorting makes
    # the labeling deterministic regardless of set iteration order.
    final_labels = [0] * n
    for idx, cluster_idx in enumerate(sorted(remaining)):
        for item in clusters[cluster_idx]:
            final_labels[item] = idx
    return final_labels, clusters


def kmeans_clustering(vectors, k=3, max_iter=100):
    """Simple k-means clustering with deterministic initialization.

    Parameters:
        vectors: list of equal-length numeric vectors.
        k: number of clusters.
        max_iter: maximum number of assign/update iterations.

    Returns:
        assignments: list mapping each vector index to a cluster id in [0, k).

    Bug fix vs. the previous version: ``vectors[::n//k]`` raised
    ``ValueError: slice step cannot be zero`` whenever n < k, and an empty
    input crashed; both cases are now guarded.
    """
    n = len(vectors)
    if n == 0:
        return []
    # Deterministic initialization: spread initial centers evenly through
    # the data.  max(1, ...) guards against a zero slice step when n < k.
    step = max(1, n // k)
    centers = vectors[::step][:k]
    # With fewer vectors than clusters, reuse vectors so there are always
    # k candidate centers (the surplus clusters simply end up empty).
    while len(centers) < k:
        centers.append(vectors[len(centers) % n])

    assignments = [0] * n
    for _ in range(max_iter):
        # Assignment step: nearest center by squared Euclidean distance.
        assignments = []
        for vec in vectors:
            distances = [sum((a - b) ** 2 for a, b in zip(vec, c))
                         for c in centers]
            assignments.append(distances.index(min(distances)))
        # Update step: move each center to its cluster's centroid; empty
        # clusters keep their previous center.
        new_centers = []
        for cluster_id in range(k):
            members = [vectors[i] for i in range(n)
                       if assignments[i] == cluster_id]
            if members:
                new_centers.append([sum(v[d] for v in members) / len(members)
                                    for d in range(len(vectors[0]))])
            else:
                new_centers.append(centers[cluster_id])
        if new_centers == centers:  # converged
            break
        centers = new_centers
    return assignments


def main():
    """Run the full multivariate analysis pipeline over coding.csv."""
    print("=" * 70)
    print("MULTIVARIATE ANALYSIS OF CODING.CSV")
    print("=" * 70)

    # ---- Load data -------------------------------------------------------
    rows = load_data('coding.csv')
    print(f"\nDataset: {len(rows)} texts coded")

    # Extract the (up to five) virtue codes assigned to each text, plus its
    # source label.  Blank/whitespace-only cells are skipped.
    virtue_cols = ['Virtue_1', 'Virtue_2', 'Virtue_3', 'Virtue_4', 'Virtue_5']
    all_virtues_per_row = []
    source_per_row = []
    for row in rows:
        virtues = []
        for col in virtue_cols:
            val = (row.get(col) or '').strip()
            if val:
                virtues.append(val)
        all_virtues_per_row.append(virtues)
        source_per_row.append(row.get('Source', 'Unknown'))

    # ---- Basic coding statistics ----------------------------------------
    virtue_counts = [len(v) for v in all_virtues_per_row]
    avg_virtues = sum(virtue_counts) / len(virtue_counts)
    print(f"\nCoding Statistics:")
    print(f" - Average virtues per text: {avg_virtues:.2f}")
    print(f" - Range: {min(virtue_counts)} - {max(virtue_counts)}")

    all_virtues_flat = [v for sublist in all_virtues_per_row for v in sublist]
    unique_virtues = sorted(set(all_virtues_flat))
    print(f" - Unique virtue categories: {len(unique_virtues)}")

    # ---- 1. Frequency distribution --------------------------------------
    print("\n" + "=" * 70)
    print("1. FREQUENCY DISTRIBUTION OF VIRTUES")
    print("=" * 70)
    virtue_freq = Counter(all_virtues_flat)
    print(f"\n{'Rank':<6} {'Count':<6} {'Virtue':<40}")
    print("-" * 55)
    for rank, (virtue, count) in enumerate(virtue_freq.most_common(30), 1):
        pct = (count / len(rows)) * 100
        print(f"{rank:<6} {count:<6} {virtue:<40} ({pct:.1f}%)")

    # ---- 2. Co-occurrence analysis --------------------------------------
    print("\n" + "=" * 70)
    print("2. CO-OCCURRENCE ANALYSIS")
    print("=" * 70)
    # Count unordered virtue pairs that were coded on the same text.
    cooccurrence = Counter()
    for virtues in all_virtues_per_row:
        for pair in combinations(sorted(virtues), 2):
            cooccurrence[pair] += 1
    print(f"\nTop 20 Virtue Pairs (appear in same text):")
    print(f"{'Virtue 1':<30} {'Virtue 2':<30} {'Count':<6}")
    print("-" * 70)
    for (v1, v2), count in cooccurrence.most_common(20):
        print(f"{v1:<30} {v2:<30} {count:<6}")

    # Association strength via Jaccard index over the sets of text indices
    # each virtue appears in.
    print(f"\n\nStrongest Associations (Jaccard Similarity):")
    print(f"{'Virtue 1':<30} {'Virtue 2':<30} {'Jaccard':<8}")
    print("-" * 70)
    virtue_sets = defaultdict(set)
    for idx, virtues in enumerate(all_virtues_per_row):
        for v in virtues:
            virtue_sets[v].add(idx)
    associations = []
    for (v1, v2), count in cooccurrence.items():
        set1 = virtue_sets[v1]
        set2 = virtue_sets[v2]
        jaccard = len(set1 & set2) / len(set1 | set2)
        associations.append((jaccard, v1, v2, count))
    associations.sort(reverse=True)
    for jaccard, v1, v2, count in associations[:20]:
        if count >= 2:  # Only show pairs that appear at least twice
            print(f"{v1:<30} {v2:<30} {jaccard:.3f}")

    # Binary presence/absence vector per text, over all unique virtues.
    virtue_to_idx = {v: i for i, v in enumerate(unique_virtues)}
    binary_vectors = []
    for virtues in all_virtues_per_row:
        vec = [0] * len(unique_virtues)
        for v in virtues:
            if v in virtue_to_idx:
                vec[virtue_to_idx[v]] = 1
        binary_vectors.append(vec)

    # ---- 3. Cluster analysis of texts -----------------------------------
    print("\n" + "=" * 70)
    print("3. CLUSTER ANALYSIS OF TEXTS (based on virtue profiles)")
    print("=" * 70)
    k = 4
    clusters = kmeans_clustering(binary_vectors, k=k)
    print(f"\nK-Means Clustering (k={k}):")
    print("-" * 70)
    for cluster_id in range(k):
        cluster_texts = [i for i, c in enumerate(clusters) if c == cluster_id]
        cluster_size = len(cluster_texts)
        # Dominant virtues within this cluster.
        cluster_virtues = []
        for idx in cluster_texts:
            cluster_virtues.extend(all_virtues_per_row[idx])
        cluster_virtue_freq = Counter(cluster_virtues)
        print(f"\nCluster {cluster_id + 1} ({cluster_size} texts):")
        print(f" Sources: {', '.join(set(source_per_row[i] for i in cluster_texts))}")
        print(f" Top virtues: {', '.join([f'{v}({c})' for v, c in cluster_virtue_freq.most_common(5)])}")

    # ---- 4. Virtue communities ------------------------------------------
    print("\n" + "=" * 70)
    print("4. VIRTUE CLUSTERING (which virtues tend to co-occur)")
    print("=" * 70)
    print("\nVirtue Communities (highly connected groups):")
    # Weighted adjacency: co-occurrence normalized by the geometric mean of
    # the two virtues' frequencies (pointwise-mutual-information-like).
    adjacency = defaultdict(lambda: defaultdict(float))
    for (v1, v2), count in cooccurrence.items():
        total_v1 = virtue_freq[v1]
        total_v2 = virtue_freq[v2]
        if total_v1 > 0 and total_v2 > 0:
            strength = count / math.sqrt(total_v1 * total_v2)
            adjacency[v1][v2] = strength
            adjacency[v2][v1] = strength

    # Simple community detection: connected components of the graph kept
    # after thresholding edge strength at 0.3 (depth-first traversal).
    visited = set()
    communities = []
    for virtue in unique_virtues:
        if virtue not in visited:
            community = set()
            stack = [virtue]
            while stack:
                current = stack.pop()
                if current not in visited:
                    visited.add(current)
                    community.add(current)
                    for neighbor, strength in adjacency[current].items():
                        if strength >= 0.3 and neighbor not in visited:
                            stack.append(neighbor)
            if len(community) >= 3:
                communities.append(sorted(community))
    if communities:
        for i, community in enumerate(communities[:6], 1):
            print(f"\nCommunity {i}: {', '.join(community[:8])}")
            if len(community) > 8:
                print(f" ... and {len(community) - 8} more")
    else:
        print("No strong communities detected with current threshold")

    # ---- 5. Source-based patterns ---------------------------------------
    print("\n" + "=" * 70)
    print("5. SOURCE-BASED PATTERN ANALYSIS")
    print("=" * 70)
    source_virtues = defaultdict(list)
    for idx, (source, virtues) in enumerate(zip(source_per_row, all_virtues_per_row)):
        source_virtues[source].extend(virtues)
    print(f"\n{'Source':<15} {'Texts':<8} {'Top Virtues (frequency)'}")
    print("-" * 70)
    for source in sorted(set(source_per_row)):
        texts = source_per_row.count(source)
        freq = Counter(source_virtues[source])
        top = ', '.join([f"{v}({c})" for v, c in freq.most_common(4)])
        print(f"{source:<15} {texts:<8} {top}")

    # ---- 6. Summary insights --------------------------------------------
    print("\n" + "=" * 70)
    print("6. KEY INSIGHTS")
    print("=" * 70)
    print(f"""
SUMMARY:
- Dataset contains {len(rows)} texts from {len(set(source_per_row))} different sources
- {len(unique_virtues)} unique virtue categories were identified
- Texts have an average of {avg_virtues:.1f} virtues assigned (range: {min(virtue_counts)}-{max(virtue_counts)})

TOP FINDINGS:
1. Most frequent virtue: '{virtue_freq.most_common(1)[0][0]}' ({virtue_freq.most_common(1)[0][1]} occurrences)
2. Strongest virtue pair: '{associations[0][1]}' + '{associations[0][2]}' (Jaccard: {associations[0][0]:.3f})
3. Multiple distinct virtue communities detected, suggesting conceptual clustering
4. {len([c for c in communities if len(c) >= 3])} major virtue communities identified
""")


if __name__ == "__main__":
    main()