Skip to content

Instantly share code, notes, and snippets.

@Yomguithereal
Created November 14, 2017 15:10
Show Gist options
  • Save Yomguithereal/9809c66afa2437f38f7bd56fdbe3a503 to your computer and use it in GitHub Desktop.
Save Yomguithereal/9809c66afa2437f38f7bd56fdbe3a503 to your computer and use it in GitHub Desktop.
FNAC Artists Sequences Clustering Script
#!/usr/bin/env python3
# Points to improve:
# 1) Distance metrics
# 2) Clustering scheme
import csv
import re
from collections import defaultdict
# Parameters

# Path of the CSV file to read. The script reads its `authors_list`,
# `acquisition_year` and `acquisition_mode` columns.
SOURCE_CSV_PATH = './uniq_artworks.csv'

# Splits an authors list on a comma followed by whitespace, but only when the
# next character is an uppercase (possibly accented) letter. Written as a raw
# string so that `\s` reaches the regex engine instead of being treated as an
# (invalid) string escape sequence.
ARTISTS_SEPARATOR = re.compile(
    r',\s+(?=[A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ])'
)

# Author tokens that are dropped from every row.
TOKEN_BLACK_LIST = {'M/M'}

# Minimum number of acquisitions, and minimum length of the squeezed
# sequence, for an artist to be kept.
SEQUENCE_LENGTH_THRESHOLD = 3

# Two artists are linked when their distance is strictly below this value.
DISTANCE_THRESHOLD = 1

# Indices

# artist name -> list of (year, mode) acquisitions
ARTISTS_ACQUISITION_INDEX = defaultdict(list)

# artist name -> squeezed sequence of mode letters
ARTISTS_SEQUENCES = {}
# Helper functions
def substitution_cost(mode1, mode2, artist1, artist2):
    """Return the cost of substituting one mode for another.

    The cost is currently uniform: every argument is accepted but ignored
    and the function always returns 1.
    """
    return 1
def levenshtein(str1, str2, artist1, artist2):
    """Return the Levenshtein edit distance between ``str1`` and ``str2``.

    Insertions and deletions cost 1. A substitution costs whatever
    ``substitution_cost`` returns for the two differing symbols.

    Args:
        str1: First sequence of symbols.
        str2: Second sequence of symbols.
        artist1: Forwarded unchanged to ``substitution_cost``.
        artist2: Forwarded unchanged to ``substitution_cost``.

    Returns:
        The accumulated edit cost of turning ``str1`` into ``str2``.
    """
    # Only the previous row of the DP table is ever read, so two rows are
    # enough. Row 0 is the cost of building each prefix of str2 from nothing.
    previous = list(range(len(str2) + 1))

    for i, symbol1 in enumerate(str1, 1):
        # Column 0: cost of deleting the first i symbols of str1.
        current = [i]

        for j, symbol2 in enumerate(str2, 1):
            if symbol1 == symbol2:
                # Matching symbols carry the diagonal cost over unchanged.
                current.append(previous[j - 1])
            else:
                current.append(min(
                    previous[j] + 1,  # Deletion cost
                    current[j - 1] + 1,  # Insertion cost
                    # Substitution cost, computed on the two symbols that
                    # actually differ rather than on the whole sequences.
                    previous[j - 1] + substitution_cost(
                        symbol1, symbol2, artist1, artist2
                    ),
                ))

        previous = current

    return previous[-1]
# 1) Read the source CSV file
# newline='' leaves line-ending handling to the csv module, which is needed
# for quoted fields that contain line breaks.
with open(SOURCE_CSV_PATH, 'r', newline='') as sf:
    reader = csv.DictReader(sf)

    for row in reader:
        year = row['acquisition_year'].strip()

        # Skipping no year
        if not year:
            continue

        year = int(year)

        # Skipping before period
        if year < 1945:
            continue

        raw_mode = row['acquisition_mode'].strip().lower()

        # The first keyword found wins: commande, then achat, then don.
        if 'commande' in raw_mode:
            mode = 'C'
        elif 'achat' in raw_mode:
            mode = 'A'
        elif 'don' in raw_mode:
            mode = 'D'
        else:
            # Skipping useless acquisition mode
            continue

        # Authors are only split once the row is known to be kept.
        artists = ARTISTS_SEPARATOR.split(row['authors_list'].strip())

        # Adding the acquisition
        for artist in artists:
            # Skipping blacklisted tokens, and the empty name produced by a
            # blank authors list, which would otherwise become an "artist".
            if not artist or artist in TOKEN_BLACK_LIST:
                continue

            ARTISTS_ACQUISITION_INDEX[artist].append((year, mode))
# 2) Compiling sequences
for artist, acquisitions in ARTISTS_ACQUISITION_INDEX.items():

    # Filtering tiny sequences
    if len(acquisitions) < SEQUENCE_LENGTH_THRESHOLD:
        continue

    # Ordering acquisitions: (year, mode) tuples sort by year first, then by
    # mode letter when several acquisitions share a year.
    modes = [mode for _, mode in sorted(acquisitions)]

    # Squeezing the sequence: a mode is kept only when it differs from the
    # one right before it, so runs of the same mode collapse to one letter.
    sequence = ''.join(
        mode for index, mode in enumerate(modes)
        if index == 0 or mode != modes[index - 1]
    )

    # Filtering a second time, now on the squeezed length
    if len(sequence) < SEQUENCE_LENGTH_THRESHOLD:
        continue

    ARTISTS_SEQUENCES[artist] = sequence
# 3) Processing the eta-NN graph
# artist name -> list of artists whose sequence is close enough
GRAPH = defaultdict(list)
ARTISTS = list(ARTISTS_SEQUENCES)

for i, artist_source in enumerate(ARTISTS):
    sequence_source = ARTISTS_SEQUENCES[artist_source]

    # Starting at i + 1 visits every unordered pair exactly once.
    for j in range(i + 1, len(ARTISTS)):
        artist_target = ARTISTS[j]
        sequence_target = ARTISTS_SEQUENCES[artist_target]

        distance = levenshtein(
            sequence_source,
            sequence_target,
            artist_source,
            artist_target
        )

        # Strict comparison: with integer distances and a threshold of 1,
        # only identical sequences end up linked.
        if distance < DISTANCE_THRESHOLD:
            # The graph is undirected, so the edge is stored on both ends.
            GRAPH[artist_source].append(artist_target)
            GRAPH[artist_target].append(artist_source)
# 4) Deriving clusters
CLUSTERS = []
ALREADY_IN_CLUSTER = set()

for artist, neighbors in GRAPH.items():
    # An artist absorbed by an earlier cluster does not seed a new one.
    if artist in ALREADY_IN_CLUSTER:
        continue

    # The seed comes first, followed by its neighbors. Neighbors already
    # placed in an earlier cluster are left out so that no artist is
    # reported in more than one cluster.
    cluster = [artist]
    cluster.extend(
        neighbor for neighbor in neighbors
        if neighbor not in ALREADY_IN_CLUSTER
    )

    CLUSTERS.append(cluster)
    ALREADY_IN_CLUSTER.update(cluster)
# 5) Dumping the clusters
for i, cluster in enumerate(CLUSTERS):
    # Clusters are numbered from 1 in the output.
    print('Cluster n°%i containing:' % (i + 1))

    for artist in cluster:
        sequence = ARTISTS_SEQUENCES[artist]
        print(' - %s (%s)' % (artist, sequence))

    # NOTE(review): the original indentation was lost; this assumes the blank
    # line is printed once per cluster, as a separator. Confirm.
    print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment