Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Process raw Wikipedia data and create Anki deck for collective nouns
import sys
import csv
###################################################################
# Handling inputs
###################################################################
def getInput(csvFileName):
    """Read the collective-noun table and index it for cloze generation.

    The file is parsed with ``csv.DictReader`` using the fixed column order
    Subject, CollectiveNoun, Notes, Source. The first row is treated as a
    header and discarded.

    Returns a 4-tuple ``(subToNoun, nounToSub, hints, allPairs)``:

    * ``subToNoun`` -- dict mapping each subject to the list of collective
      nouns recorded for it (file order, duplicates kept).
    * ``nounToSub`` -- dict mapping each collective noun to the list of
      subjects it is recorded for.
    * ``hints`` -- dict mapping ``(subject, noun)`` to the Notes text of
      that row, present only when the Notes cell is non-empty.
    * ``allPairs`` -- set of ``(subject, noun)`` pairs whose noun is longer
      than one character.

    Rows whose Subject or CollectiveNoun cell is missing or blank are
    skipped instead of raising.

    NOTE(review): keys are the raw cell text, so "*herd*" and "herd" are
    indexed as two different nouns -- confirm whether that is intended.
    """
    subToNoun = {}
    nounToSub = {}
    hints = {}
    allPairs = set()
    # ``open`` rather than ``file``: ``file`` exists only on Python 2,
    # while ``open`` behaves the same there and also works on Python 3.
    with open(csvFileName) as f:
        reader = csv.DictReader(
            f, fieldnames=['Subject', 'CollectiveNoun', 'Notes', 'Source'])
        next(reader, None)  # Skip the header line
        for row in reader:
            sub = row['Subject']
            # Keep only the first word of the cell, so that
            # "sedge or sege" is recorded as "sedge". A short row yields
            # None for the missing cell, hence the ``or ''``.
            words = (row['CollectiveNoun'] or '').split()
            if not sub or not words:
                continue
            coll = words[0]
            notes = row['Notes']
            if notes:
                hints[(sub, coll)] = notes
            subToNoun.setdefault(sub, []).append(coll)
            nounToSub.setdefault(coll, []).append(sub)
            # Single-character nouns come from the alphabet divider rows
            # (e.g. "*A*  A  !") and must not become cards.
            if len(coll) > 1:
                allPairs.add((sub, coll))
    return (subToNoun, nounToSub, hints, allPairs)
###################################################################
# Processing
###################################################################
def cleanStr(s):
    """Return *s* with every ``*`` marker and surrounding whitespace removed.

    The input data wraps cells in asterisks ("*ants*") and also uses them
    inside a cell ("*boars* (wild boars)", "*See also* boars, pigs").
    Stripping only the ends would leave a stray ``*`` in the card text, so
    all asterisks are dropped before trimming whitespace.
    """
    return s.replace('*', '').strip()
def minPrefixIdx(targetStr, strs):
    """Return the length of the shortest prefix that singles out targetStr.

    ``strs`` is the list of candidate strings; entries equal to
    ``targetStr`` are ignored.

    Returns:
        0 when no other string remains (no prefix is needed);
        otherwise the smallest ``idx`` with ``1 <= idx < len(targetStr)``
        such that no other string starts with ``targetStr[:idx]``;
        ``None`` when no such proper prefix exists (for example "rag"
        against "rake", or a one-character target). In that case a
        diagnostic line is written to stderr.
    """
    others = [s for s in strs if s != targetStr]
    if not others:
        return 0
    # Only proper prefixes are tried: using the whole word as the prefix
    # would leave nothing to hide in the cloze.
    for idx in range(1, len(targetStr)):
        prefix = targetStr[:idx]
        if not any(s.startswith(prefix) for s in others):
            return idx
    # ``sys.stderr.write`` rather than ``print >> sys.stderr``: the latter
    # is Python-2-only syntax and raises TypeError on Python 3.
    sys.stderr.write(
        "Something went wrong: {0}, {1}\n".format(targetStr, str(others)))
    return None
def createCloze(subject, collectiveNoun, nounToSub, subToNoun, hintsDict):
    """Build the cloze text and hint for one (subject, collective noun) pair.

    ``subject`` and ``collectiveNoun`` are used as-is to look up
    ``subToNoun[subject]``, ``nounToSub[collectiveNoun]`` and
    ``hintsDict[(subject, collectiveNoun)]``; every string is passed
    through ``cleanStr`` before it is compared or rendered.

    Each word is split into a visible prefix, just long enough to tell it
    apart from the alternatives, and a hidden remainder. With prefixes of
    length 1 for both words the result looks like
    ``h{{c2::erd}} of z{{c1::ebras}}``.

    Returns ``(cloze, hint)``, where ``hint`` is ``''`` when the pair has
    no note, or ``None`` when ``minPrefixIdx`` finds no usable prefix for
    either word. A "Skipping" line is then written to stderr.
    """
    sub = cleanStr(subject)
    coll = cleanStr(collectiveNoun)
    collsPrefixIdx = minPrefixIdx(
        coll, [cleanStr(s) for s in subToNoun[subject]])
    subsPrefixIdx = minPrefixIdx(
        sub, [cleanStr(s) for s in nounToSub[collectiveNoun]])

    if collsPrefixIdx is None or subsPrefixIdx is None:
        # No proper cloze deletion exists for this item, so skip it.
        # ``sys.stderr.write`` is used because ``print >> sys.stderr`` is
        # Python-2-only and raises TypeError on Python 3.
        sys.stderr.write("Skipping ({0}, {1}).\n".format(sub, coll))
        return None

    hint = cleanStr(hintsDict.get((subject, collectiveNoun), ''))
    # Quadruple braces render as the literal "{{" / "}}" around each cloze.
    cloze = '{collPrefix}{{{{c2::{restColl}}}}} of {subPrefix}{{{{c1::{restSubject}}}}}'.format(
        subPrefix=sub[:subsPrefixIdx],
        restSubject=sub[subsPrefixIdx:],
        collPrefix=coll[:collsPrefixIdx],
        restColl=coll[collsPrefixIdx:],
    )
    return (cloze, hint)
###################################################################
# Execution and output
###################################################################
def process(inputFileName, outputFileName):
    """Convert the input table into a CSV of (cloze, hint) rows.

    Reads ``inputFileName`` with ``getInput``, builds one card per
    ``(subject, noun)`` pair with ``createCloze``, drops the pairs for
    which no cloze could be built, and writes the remaining rows to
    ``outputFileName`` with ``csv.writer``.
    """
    (subToNoun, nounToSub, hints, allPairs) = getInput(inputFileName)
    rows = []
    # ``allPairs`` is a set, whose iteration order is arbitrary. Sorting
    # makes repeated runs emit the rows in the same order.
    for subject, noun in sorted(allPairs):
        card = createCloze(subject, noun, nounToSub, subToNoun, hints)
        if card is not None:
            rows.append(card)
    # ``open`` rather than ``file``: ``file`` exists only on Python 2,
    # while ``open`` behaves the same there and also works on Python 3.
    with open(outputFileName, 'w') as f:
        csv.writer(f).writerows(rows)
Subject Collective noun Notes Sources
*A* A !
*alligators* congregation [1]
*antelopes* *herd* [2][3]
*ants* army [1]
*ants* colony [1][2]
*ants* nest [1]
*ants* swarm [1]
*apes* shrewdness [3][4][5]
*apes* troop [1]
*asses* *herd* [1]
*asses* drove driven in a group [1]
*asses* pace or passe [1][2]
*B* B !
*baboons* troop [1][5]
*badgers* cete [1][2][5][6]
*bats* colony [2]
*bears* sleuth or sloth [1][2][5]
*beavers* colony [1]
*beavers* family [1]
*bees* *swarm* [5]
*bees* drift [5]
*bees* erst [5]
*bees* grist [2]
*birds* *flock* [5]
*birds* dissimulation fanciful [4]
*birds* flight [5]
*birds* pod [5]
*bitterns* sedge or sege [2][4][7]
*boars* (wild boars) sounder 12 or more [6]
*buffalo* *herd* [2][5]
*bullfinches* bellowing [5]
*bullocks* drove driven in a group [5]
*butterflies* flutter [8]
*butterflies* *swarm* [9]
*buzzards* wake [1][2]
*C* C !
*camels* caravan [1]
*camels* flock [1]
*camels* train [1]
*capercaillie* tok [10][11]
*capons* mews [10]
*caribou* herd [10]
*cats* clowder [1][2][10]
*cats* cluster or clutter [1][10]
*cats* glaring [1][10]
*cats* pounce [1][2]
*cats* destruction wild cats (feral cats) [5][12]
*caterpillars* army [1]
*cattle* *herd* [1][2]
*cattle* drove [1][2]
*chickens* brood [1][5]
*chickens* clutch [5]
*chickens* peep [1][5]
*choughs* chattering or clattering [4][5][6][13][14]
*cockroaches* intrusion [1]
*colts* rag [2][5][6]
*colts* rake [5]
*cormorants* gulp [1][2]
*coots* covert [4][5]
*coyotes* *pack* [10]
*cranes* herd [4][5]
*cranes* sedge or sege [2][7][10]
*crocodiles* bask [1][2][10]
*crocodiles* float [1]
*crows* murder [1][2][5][6]
*curlews* herd [4][5][6]
*D* D !
*deer* *herd* [15]
*deer* bunch [15]
*deer* mob [15]
*deer* parcel [16]
*deer* rangale [15]
*dogs* *pack* [15]
*dogfish* troop [15]
*dolphins* *pod* [17][18]
*dolphins* school [15]
*dolphins* team [15]
*dotterel* trip [2][5][15][19]
*doves* dole or dule fanciful [2][4][5][15]
*doves* flight [1][5][15]
*doves* piteousness fanciful [5][15]
*ducks* *flock* [1]
*ducks* badling [2]
*ducks* bunch on water [5][15]
*ducks* paddling on water [1][2][5][15]
*ducks* plump [20][21]
*ducks* raft on water [1][2][5][15]
*ducks* skein in flight [15]
*ducks* sord or sore [15]
*ducks* string in flight [15]
*ducks* team in flight [2][15]
*ducks* waddling [15]
*dunlins* fling [5]
*E* E !
*eagles* convocation [1][2]
*eels* bed [1]
*eels* swarm [1]
*elephants* *herd* [1][2]
*elephants* memory [1]
*elk* gang [1][5][6]
*elk* herd [1][5]
*emus* *mob* [1]
*F* F !
*ferrets* business corruption over time of 'busyness' [1][2][6]
*ferrets* busyness [4]
*ferrets* besyness esynes esnyng eamyng ghost words arising from successive misspellings of "busyness" [22]
*finches* charm or chirm [1][2][5]
*fish* *school* [1][2]
*fish* shoal [1][2]
*flamingoes* stand [1][2]
*flies* business [1][2]
*flies* cloud [1]
*flies* swarm [1]
*foxes* leash [1][2]
*foxes* skulk [1][2]
*frogs* army [1][2]
*G* G !
*geese* *flock* on land [2][23]
*geese* gaggle on land [1][2][23]
*geese* plump flying close [23]
*geese* skein in flight [1][2][23]
*geese* team in flight [23]
*geese* wedge in flight [23]
*giraffes* *herd* [23]
*giraffes* corps [23]
*giraffes* tower [2][23]
*gnats* *cloud* [1][2][5][23]
*gnats* horde [1][2][23]
*gnats* rabble [23]
*gnats* swarm [1][23]
*goats* *flock* [23]
*goats* *herd* [23]
*goats* tribe [23]
*goats* trip or trippe [23]
*goldfinches* charm [23]
*goosanders* dopping [23]
*gorillas* band [23]
*gorillas* whoop [23]
*goshawks* flight [4][23]
*grasshoppers* cloud [2][23]
*grasshoppers* swarm [23]
*grouse* covey [5][23]
*grouse* pack [5][23]
*guineafowl* rasp [24]
*H* H !
*hares* down [25]
*hares* drove [25]
*hares* flick [25]
*hares* husk [25]
*hawks* boil two or more spiralling in flight [1][2]
*hawks* cast [1][2][4][5]
*hawks* kettle flying in large numbers [1][2]
*hawks* lease [4]
*harts* herd [25]
*hedgehogs* array [25]
*herons* sedge or sege [1][2][4][6][7][25]
*herons* siege [1][2][5][25]
*hippopotamuses* bloat [1][2][25]
*hogs* parcel *See also* boars, pigs, swine [26][27]
*hornets* bike [25]
*hornets* nest [2]
*hornets* swarm [25]
*horses* *team* in harness [1][2][25]
*horses* band [1]
*horses* haras or harrase [2][25]
*horses* herd [1][25]
*horses* stable [1][25]
*horses* string [1][2][25]
*horses* stud [2][25]
*hounds* *pack* [25]
*hounds* cry [25]
*hounds* hunt [25]
*hounds* meet [25]
*hounds* mute [25]
*hounds* stable [25]
*hyenas* cackle [1][2]
*I* I !
*ibex* herd [28]
*ibises* colony [28]
*iguanas* mess [28]
*insects* flight [28]
*insects* horde [28]
*insects* plague [28]
*insects* rabble [28]
*insects* swarm [28]
*J* J !
*jackdaws* clattering [29]
*jackdaws* train [29][30]
*jays* band [1]
*jays* party [1][2]
*jays* scold [1][2]
*jellyfish* fluther [5]
*jellyfish* smack [1][2][5]
*K* K !
*kangaroos* mob [1]
*kangaroos* troop [1][2]
*kittens* kindle [5]
*L* L !
*lapwings* deceit or desert [2][4][5][6][31]
*larks* bevy [5]
*larks* exaltation [4][5][6]
*leopards* leap or lepe [1][2][5]
*lions* *pride* [1][2][5]
*lions* sawt [5]
*locusts* plague [2]
*M* M !
*mackerel* shoal [32]
*magpies* charm [2]
*magpies* congregation [32]
*magpies* gulp [2]
*magpies* murder [2]
*magpies* tiding [2][4][5][32]
*magpies* tittering [32]
*mallards* flush [32]
*mallards* puddling [32]
*mallards* sord [2][5][32]
*mallards* suit or sute [5][32]
*mares* stud [5][6][32]
*martens* richesse or richness [2][5][32]
*mice* nest [1][32]
*mice* mischief [33]
*mice* trip [32]
*minnows* shoal [32]
*moles* company [32]
*moles* labor or labour [1][2][5][32]
*monkeys* barrel [1][2]
*monkeys* cartload [32]
*monkeys* tribe [32]
*monkeys* troop [2][5][32]
*monkeys* wilderness [34]
*moorhens* plump [32]
*moose* *herd* [1][32]
*mosquitoes* scourge [32]
*mudhens* fleet [32]
*mules* barren [1][2][5][6][32]
*mules* pack [2][32]
*mules* rake [32]
*mules* span [1][2][32]
*N* N !
*nightingales* watch [4][5]
*O* O !
*owls* parliament [1][2][6]
*oxen* drove [1]
*oxen* team [1][2]
*oxen* yoke [1][2]
*oysters* bed [1][2]
*P* P !
*parrots* company [1][2]
*parrots* pandemonium [1]
*partridges* covey [2][5]
*peacocks* muster [2][5][6]
*peacocks* ostentation [1][2]
*peacocks* pride [1]
*pelicans* pod [1]
*pheasants* bouquet [1][2]
*pheasants* head [5]
*pheasants* nest [2]
*pheasants* nide [2]
*pheasants* nye [1][2][5][6]
*pigs* drift *See also* boars, hogs, swine [1][2]
*pigs* drove [1][2]
*pigeons* kit flying together [5]
*plovers* congregation [4][5]
*plovers* stand [5]
*plovers* wing [5]
*Q* Q !
*quail* bevy [1][2][4][5]
*quail* covey [1][2]
*quail* drift [5]
*R* R !
*raccoons* gaze [1]
*ravens* unkindness [4][5]
*rhinoceroses* crash [1][2][5]
*rooks* building [4][5]
*rooks* parliament [5][6]
*S* S !
*salmon* bind [35]
*salmon* draught [35]
*salmon* run [35]
*sandpipers* fling [35]
*seals* herd [35]
*seals* pod [35]
*seals* rookery [35]
*sea urchins* herd [36][37]
*sheep* *flock* [1][2][35]
*sheep* down [35]
*sheep* drift when driven in a group [35]
*sheep* drove when driven in a group [2][35]
*sheep* herd [2][35]
*sheep* hurtle [1][35]
*sheep* meinie [35]
*sheep* mob [1]
*sheep* parcel [35]
*sheep* trip [35]
*sheldrake* dopping [35][38]
*snails* escargatoire [35]
*snails* rout [35]
*snails* walk [35]
*snakes* bed [1]
*snakes* den [1]
*snakes* knot [1]
*snakes* nest [2]
*snakes* pit [1]
*snipes* walk [5][6][35]
*snipes* whisp or wisp [5][35]
*sparrows* host [1][2][4][5][35]
*sparrows* meinie [35]
*sparrows* tribe [35]
*spiders* cluster or clutter [1][35]
*squirrels* colony [35]
*squirrels* dray [1][2]
*squirrels* scurry [1][2]
*starlings* chattering [1][13][35]
*starlings* clattering [35]
*starlings* cloud [35]
*starlings* congregation [35]
*starlings* murmuration [1][4][5][6][35]
*stoats* pack [35]
*stoats* trip [35]
*storks* flight [35]
*storks* mustering [1][2][35]
*storks* phalanx when migrating [35]
*swallows* flight [1][2][35]
*swallows* gulp [35]
*swans* bank on the ground [1][35]
*swans* bevy [1][2][35]
*swans* drift [35]
*swans* eyrar [35]
*swans* flight [1]
*swans* game [5][35]
*swans* herd [1][35]
*swans* lamentation fanciful [35]
*swans* sownder [35]
*swans* team [35]
*swans* wedge in flight [1][2][5][35]
*swans* whiting [35]
*swifts* *flock* [35]
*swifts* scream [39]
*swine* doylt *See also* boars, hogs, pigs [35]
*swine* drift [6][35]
*swine* trip [35]
*T* T !
*teal* spring [2][4][5]
*tigers* ambush [1]
*tigers* streak [1][2]
*toads* knot [1][2]
*toads* nest [1]
*trout* hover [1][2]
*turkeys* gang [2]
*turkeys* rafter [1][2]
*turtles* bale [1][2]
*turtles* dole [1]
*turtles* nest [2]
*turtle doves* dole or dule [4]
*turtle doves* pitying [2]
*U* U !
*V* V !
*vipers* nest [2]
*W* W !
*walruses* herd [2]
*walruses* pod [2]
*waterfowl* bunch [5]
*waterfowl* knob [5]
*waterfowl* raft [5]
*weasels* colony [1]
*weasels* pack [1]
*whales* gam [1][2]
*whales* herd [2]
*whales* mob [1]
*whales* pod [1][2]
*wigeons* company [2][5]
*wigeons* trip [5]
*wildfowl* bunch [5]
*wildfowl* knob fewer than 30 [5]
*wildfowl* plump [5]
*wildfowl* trip [5]
*wolves* *pack* [1][2]
*wolves* rout or route when in movement [1][2]
*wombats* wisdom [40]
*woodcocks* fall [2][4][5]
*woodpeckers* descent [1][2][5]
*wrens* herd [4][5]
*X* X !
*Y* Y !
*Z* Z !
*zebras* *herd* [1][41]
*zebras* cohort [41]
*zebras* dazzle [42]
*zebras* zeal [1][41]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment