Initializing repository with study materials

Nathan Schneider
2026-03-29 15:25:34 -06:00
commit 825d54dda7
21 changed files with 3097 additions and 0 deletions

@@ -0,0 +1,324 @@
#!/usr/bin/env python3
"""
Multivariate Analysis of coding.csv Virtue Data
Uses only Python standard library
"""
import csv
import json
from collections import defaultdict, Counter
from itertools import combinations
import math


def load_data(filename):
    with open(filename, 'r') as f:
        reader = csv.DictReader(f)
        rows = list(reader)
    return rows


def jaccard_similarity(set1, set2):
    """Calculate Jaccard similarity between two sets"""
    if not set1 and not set2:
        return 1.0
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union > 0 else 0.0
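

# Quick sanity check (hypothetical sets, not drawn from coding.csv):
#   jaccard_similarity({'a', 'b', 'c'}, {'b', 'c', 'd'}) -> 2 / 4 = 0.5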


def cosine_similarity(vec1, vec2):
    """Calculate cosine similarity between two binary vectors"""
    dot = sum(a * b for a, b in zip(vec1, vec2))
    norm1 = math.sqrt(sum(a * a for a in vec1))
    norm2 = math.sqrt(sum(b * b for b in vec2))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)
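

# Quick sanity check (hypothetical vectors):
#   cosine_similarity([1, 0, 1], [1, 1, 0]) -> 1 / (sqrt(2) * sqrt(2)) = 0.5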


def hierarchical_clustering(distance_matrix, labels, n_clusters=4):
    """Simple agglomerative hierarchical clustering using average linkage"""
    n = len(labels)
    clusters = [{i} for i in range(n)]
    remaining = set(range(n))
    while len(remaining) > n_clusters:
        # Find the closest pair of active clusters
        min_dist = float('inf')
        to_merge = None
        for i in remaining:
            for j in remaining:
                if i < j and distance_matrix[i][j] < min_dist:
                    min_dist = distance_matrix[i][j]
                    to_merge = (i, j)
        if to_merge is None:
            break
        i, j = to_merge
        size_i, size_j = len(clusters[i]), len(clusters[j])
        clusters.append(clusters[i] | clusters[j])
        # Average linkage: distance to the merged cluster is the size-weighted
        # mean of the distances to its two constituents
        new_distances = []
        for k in range(len(distance_matrix)):
            if k in (i, j):
                new_distances.append(float('inf'))
            else:
                new_distances.append((size_i * distance_matrix[i][k]
                                      + size_j * distance_matrix[j][k])
                                     / (size_i + size_j))
        # Grow the matrix by one column and one row for the merged cluster
        for k, row in enumerate(distance_matrix):
            row.append(new_distances[k])
        distance_matrix.append(new_distances + [0.0])
        remaining.remove(i)
        remaining.remove(j)
        remaining.add(len(clusters) - 1)
    # Assign final cluster labels (0..n_clusters-1) to the original items
    final_labels = [0] * n
    for idx, cluster_idx in enumerate(sorted(remaining)):
        for item in clusters[cluster_idx]:
            final_labels[item] = idx
    return final_labels, clusters
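

# Illustrative call (hierarchical_clustering is not invoked by main(); the matrix
# below holds hypothetical Jaccard *distances*, i.e. 1 - similarity):
#   dist = [[0.0, 0.4, 0.9],
#           [0.4, 0.0, 0.8],
#           [0.9, 0.8, 0.0]]
#   labels, clusters = hierarchical_clustering(dist, ['t1', 't2', 't3'], n_clusters=2)
#   # labels -> [1, 1, 0]: t1 and t2 (the closest pair, at 0.4) share a cluster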


def kmeans_clustering(vectors, k=3, max_iter=100):
    """Simple k-means clustering"""
    n = len(vectors)
    # Deterministic initialization: pick k evenly spaced vectors as centers
    step = max(n // k, 1)
    centers = vectors[::step][:k]
    assignments = []
    for _ in range(max_iter):
        # Assign each vector to its nearest center (squared Euclidean distance)
        assignments = []
        for vec in vectors:
            distances = [sum((a - b) ** 2 for a, b in zip(vec, c)) for c in centers]
            assignments.append(distances.index(min(distances)))
        # Recompute each center as the mean of its assigned vectors
        new_centers = []
        for cluster_id in range(len(centers)):
            cluster_vecs = [vectors[i] for i in range(n) if assignments[i] == cluster_id]
            if cluster_vecs:
                new_centers.append([sum(v[d] for v in cluster_vecs) / len(cluster_vecs)
                                    for d in range(len(vectors[0]))])
            else:
                # Keep an empty cluster's center unchanged
                new_centers.append(centers[cluster_id])
        if new_centers == centers:
            break
        centers = new_centers
    return assignments
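

# Illustrative call (hypothetical 4-text corpus over 3 virtue columns); with the
# deterministic initialization above this returns [0, 0, 1, 1]:
#   vecs = [[1, 0, 1], [1, 0, 1], [0, 1, 0], [0, 1, 1]]
#   kmeans_clustering(vecs, k=2)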


def main():
    print("=" * 70)
    print("MULTIVARIATE ANALYSIS OF CODING.CSV")
    print("=" * 70)

    # Load data
    rows = load_data('coding.csv')
    print(f"\nDataset: {len(rows)} texts coded")

    # Extract virtues per row
    virtue_cols = ['Virtue_1', 'Virtue_2', 'Virtue_3', 'Virtue_4', 'Virtue_5']
    all_virtues_per_row = []
    source_per_row = []
    for row in rows:
        virtues = []
        for col in virtue_cols:
            val = (row.get(col) or '').strip()
            if val:
                virtues.append(val)
        all_virtues_per_row.append(virtues)
        source_per_row.append(row.get('Source', 'Unknown'))

    # Coding statistics
    virtue_counts = [len(v) for v in all_virtues_per_row]
    avg_virtues = sum(virtue_counts) / len(virtue_counts)
    print("\nCoding Statistics:")
    print(f" - Average virtues per text: {avg_virtues:.2f}")
    print(f" - Range: {min(virtue_counts)} - {max(virtue_counts)}")

    # All unique virtues
    all_virtues_flat = [v for sublist in all_virtues_per_row for v in sublist]
    unique_virtues = sorted(set(all_virtues_flat))
    print(f" - Unique virtue categories: {len(unique_virtues)}")

    # Frequency analysis
    print("\n" + "=" * 70)
    print("1. FREQUENCY DISTRIBUTION OF VIRTUES")
    print("=" * 70)
    virtue_freq = Counter(all_virtues_flat)
    print(f"\n{'Rank':<6} {'Count':<6} {'Virtue':<40}")
    print("-" * 55)
    for rank, (virtue, count) in enumerate(virtue_freq.most_common(30), 1):
        pct = (count / len(rows)) * 100
        print(f"{rank:<6} {count:<6} {virtue:<40} ({pct:.1f}%)")

    # Co-occurrence analysis: count how often each pair of virtues
    # is assigned to the same text
    print("\n" + "=" * 70)
    print("2. CO-OCCURRENCE ANALYSIS")
    print("=" * 70)
    cooccurrence = Counter()
    for virtues in all_virtues_per_row:
        for pair in combinations(sorted(virtues), 2):
            cooccurrence[pair] += 1
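    # e.g. a text coded ['honesty', 'courage', 'justice'] (hypothetical values)
    # contributes the sorted pairs ('courage', 'honesty'), ('courage', 'justice'),
    # and ('honesty', 'justice') to the counter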
print(f"\nTop 20 Virtue Pairs (appear in same text):")
print(f"{'Virtue 1':<30} {'Virtue 2':<30} {'Count':<6}")
print("-" * 70)
for (v1, v2), count in cooccurrence.most_common(20):
print(f"{v1:<30} {v2:<30} {count:<6}")

    # Association strength: Jaccard index over the sets of texts containing each virtue
    print("\n\nStrongest Associations (Jaccard Similarity):")
    print(f"{'Virtue 1':<30} {'Virtue 2':<30} {'Jaccard':<8}")
    print("-" * 70)
    virtue_sets = defaultdict(set)
    for idx, virtues in enumerate(all_virtues_per_row):
        for v in virtues:
            virtue_sets[v].add(idx)
    associations = []
    for (v1, v2), count in cooccurrence.items():
        jaccard = jaccard_similarity(virtue_sets[v1], virtue_sets[v2])
        associations.append((jaccard, v1, v2, count))
    associations.sort(reverse=True)
    # Only show pairs that co-occur at least twice
    for jaccard, v1, v2, count in [a for a in associations if a[3] >= 2][:20]:
        print(f"{v1:<30} {v2:<30} {jaccard:.3f}")

    # Create binary vectors for each text (one-hot over the virtue vocabulary)
    virtue_to_idx = {v: i for i, v in enumerate(unique_virtues)}
    binary_vectors = []
    for virtues in all_virtues_per_row:
        vec = [0] * len(unique_virtues)
        for v in virtues:
            if v in virtue_to_idx:
                vec[virtue_to_idx[v]] = 1
        binary_vectors.append(vec)
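    # e.g. if unique_virtues were ['courage', 'honesty', 'justice'] (hypothetical),
    # a text coded ['honesty', 'justice'] becomes the vector [0, 1, 1]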

    # Cluster texts by their virtue profiles
    print("\n" + "=" * 70)
    print("3. CLUSTER ANALYSIS OF TEXTS (based on virtue profiles)")
    print("=" * 70)
    # K-means clustering
    k = 4
    clusters = kmeans_clustering(binary_vectors, k=k)
    print(f"\nK-Means Clustering (k={k}):")
    print("-" * 70)
    for cluster_id in range(k):
        cluster_texts = [i for i, c in enumerate(clusters) if c == cluster_id]
        cluster_size = len(cluster_texts)
        # Get dominant virtues in this cluster
        cluster_virtues = []
        for idx in cluster_texts:
            cluster_virtues.extend(all_virtues_per_row[idx])
        cluster_virtue_freq = Counter(cluster_virtues)
        print(f"\nCluster {cluster_id + 1} ({cluster_size} texts):")
        print(f" Sources: {', '.join(set(source_per_row[i] for i in cluster_texts))}")
        print(f" Top virtues: {', '.join([f'{v}({c})' for v, c in cluster_virtue_freq.most_common(5)])}")

    # Virtue clustering: which virtues tend to co-occur?
    print("\n" + "=" * 70)
    print("4. VIRTUE CLUSTERING (which virtues tend to co-occur)")
    print("=" * 70)
    # Build a virtue-virtue similarity graph from co-occurrence counts
    print("\nVirtue Communities (highly connected groups):")
    # Build adjacency list
    adjacency = defaultdict(lambda: defaultdict(float))
    for (v1, v2), count in cooccurrence.items():
        total_v1 = virtue_freq[v1]
        total_v2 = virtue_freq[v2]
        # Co-occurrence normalized by the geometric mean of the marginal
        # frequencies (the Ochiai / cosine coefficient)
        if total_v1 > 0 and total_v2 > 0:
            strength = count / math.sqrt(total_v1 * total_v2)
            adjacency[v1][v2] = strength
            adjacency[v2][v1] = strength
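    # e.g. (hypothetical counts) two virtues appearing 4 and 9 times overall that
    # co-occur in 3 texts get strength 3 / sqrt(4 * 9) = 0.5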

    # Simple community detection: depth-first search over edges with strength >= 0.3
    visited = set()
    communities = []
    for virtue in unique_virtues:
        if virtue not in visited:
            community = set()
            stack = [virtue]
            while stack:
                current = stack.pop()
                if current not in visited:
                    visited.add(current)
                    community.add(current)
                    for neighbor, strength in adjacency[current].items():
                        if strength >= 0.3 and neighbor not in visited:
                            stack.append(neighbor)
            if len(community) >= 3:
                communities.append(sorted(community))
    if communities:
        for i, community in enumerate(communities[:6], 1):
            print(f"\nCommunity {i}: {', '.join(community[:8])}")
            if len(community) > 8:
                print(f" ... and {len(community) - 8} more")
    else:
        print("No strong communities detected with current threshold")

    # Source analysis
    print("\n" + "=" * 70)
    print("5. SOURCE-BASED PATTERN ANALYSIS")
    print("=" * 70)
    source_virtues = defaultdict(list)
    for source, virtues in zip(source_per_row, all_virtues_per_row):
        source_virtues[source].extend(virtues)
    print(f"\n{'Source':<15} {'Texts':<8} {'Top Virtues (frequency)'}")
    print("-" * 70)
    for source in sorted(set(source_per_row)):
        texts = source_per_row.count(source)
        freq = Counter(source_virtues[source])
        top = ', '.join([f"{v}({c})" for v, c in freq.most_common(4)])
        print(f"{source:<15} {texts:<8} {top}")

    # Summary insights
    print("\n" + "=" * 70)
    print("6. KEY INSIGHTS")
    print("=" * 70)
    top_virtue, top_count = virtue_freq.most_common(1)[0]
    print(f"""
SUMMARY:
- Dataset contains {len(rows)} texts from {len(set(source_per_row))} different sources
- {len(unique_virtues)} unique virtue categories were identified
- Texts have an average of {avg_virtues:.1f} virtues assigned (range: {min(virtue_counts)}-{max(virtue_counts)})

TOP FINDINGS:
1. Most frequent virtue: '{top_virtue}' ({top_count} occurrences)
2. Strongest virtue pair: '{associations[0][1]}' + '{associations[0][2]}' (Jaccard: {associations[0][0]:.3f})
3. Multiple distinct virtue communities detected, suggesting conceptual clustering
4. {len(communities)} major virtue communities identified
""")


if __name__ == "__main__":
    main()