Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Process raw Wikipedia data and create an Anki deck for collective nouns
import sys
import csv
###################################################################
# Handling inputs
###################################################################
def getInput(csvFileName):
    """Read the collective-noun CSV file and index its rows.

    The file is parsed with ``csv.DictReader`` using the fixed column names
    ``Subject``, ``CollectiveNoun``, ``Notes`` and ``Source``. The first
    line is treated as a header and discarded.

    Only the first whitespace-separated word of the ``CollectiveNoun`` cell
    is used (e.g. ``"sleuth or sloth"`` becomes ``"sleuth"``). Rows whose
    collective-noun cell is missing or blank are skipped.

    Returns a 4-tuple ``(subToNoun, nounToSub, hints, allPairs)``:

    * ``subToNoun`` -- dict mapping each subject to the list of its nouns.
    * ``nounToSub`` -- dict mapping each noun to the list of its subjects.
    * ``hints``     -- dict mapping ``(subject, noun)`` to the non-empty
      ``Notes`` text of that row.
    * ``allPairs``  -- set of ``(subject, noun)`` pairs whose noun is longer
      than one character.
    """
    subToNoun = {}
    nounToSub = {}
    hints = {}
    allPairs = set()

    # ``open`` works on both Python 2 and 3; the ``file`` builtin is
    # Python-2-only.
    with open(csvFileName) as f:
        reader = csv.DictReader(
            f, fieldnames=['Subject', 'CollectiveNoun', 'Notes', 'Source'])
        next(reader, None)  # Skip the header line

        for row in reader:
            sub = row['Subject']

            # DictReader fills missing trailing columns with None, and a
            # blank cell splits to an empty list; neither yields a noun.
            words = (row['CollectiveNoun'] or '').split()
            if not words:
                continue
            # Select only the first collective noun, if many
            coll = words[0]

            notes = row['Notes']
            if notes:
                hints[(sub, coll)] = notes

            subToNoun.setdefault(sub, []).append(coll)
            nounToSub.setdefault(coll, []).append(sub)

            # Single-character nouns come from the alphabet divider rows
            # (e.g. "*A*  A  !"); keep them out of the card pairs.
            if len(coll) > 1:
                allPairs.add((sub, coll))

    return (subToNoun, nounToSub, hints, allPairs)
###################################################################
# Processing
###################################################################
def cleanStr(s):
    """Return *s* without emphasis asterisks and surrounding whitespace.

    The raw data marks emphasis with asterisks (``*ants*``,
    ``*See also* boars, pigs, swine``, ``*boars* (wild boars)``). Every
    asterisk is removed, not only those at the ends of the string, so no
    stray ``*`` is left inside the card text. Leading and trailing
    whitespace is then trimmed.
    """
    return s.replace('*', '').strip()
def minPrefixIdx(targetStr, strs):
    """Return the length of the shortest prefix of *targetStr* that no other
    string in *strs* starts with.

    Entries of *strs* equal to *targetStr* are ignored. The result is:

    * ``0`` when no other strings remain (no prefix is needed);
    * the smallest ``idx`` with ``1 <= idx < len(targetStr)`` such that
      ``targetStr[:idx]`` is not a prefix of any other string;
    * ``None`` when no such index exists (for example when one string is
      entirely a prefix of another). A diagnostic line is written to
      standard error in that case.

    The index is kept strictly below ``len(targetStr)`` so that at least one
    character always remains after the prefix.
    """
    # Remove the targetStr from strs, if it is present
    others = [s for s in strs if s != targetStr]

    # If there is only one string, then no prefix is needed
    if not others:
        return 0

    for idx in range(1, len(targetStr)):
        prefix = targetStr[:idx]
        if not any(s.startswith(prefix) for s in others):
            return idx

    # One string was completely a prefix of another!
    # ``sys.stderr.write`` replaces the Python-2-only ``print >>`` form.
    sys.stderr.write(
        "Something went wrong: {0}, {1}\n".format(targetStr, str(others)))
    return None
def createCloze(subject, collectiveNoun, nounToSub, subToNoun, hintsDict):
    """Build the cloze-deletion text for one (subject, collective noun) pair.

    Parameters:
        subject        -- raw subject string, used as a key into *subToNoun*.
        collectiveNoun -- raw noun string, used as a key into *nounToSub*.
        nounToSub      -- dict mapping a noun to the list of its subjects.
        subToNoun      -- dict mapping a subject to the list of its nouns.
        hintsDict      -- dict mapping ``(subject, collectiveNoun)`` to a
                          note; a missing entry yields an empty hint.

    Each word is split into a visible prefix and a hidden remainder. The
    prefix is the shortest one that tells the word apart from the other
    nouns of the same subject (or the other subjects of the same noun), as
    computed by ``minPrefixIdx``. The result has the form
    ``<collPrefix>{{c2::<rest>}} of <subPrefix>{{c1::<rest>}}``.

    Returns a ``(cloze, hint)`` tuple, or ``None`` when no distinguishing
    prefix exists for either word. In that case a "Skipping" line is written
    to standard error.
    """
    colls = [cleanStr(s) for s in subToNoun[subject]]
    subs = [cleanStr(s) for s in nounToSub[collectiveNoun]]
    hint = cleanStr(hintsDict.get((subject, collectiveNoun), ''))

    sub = cleanStr(subject)
    coll = cleanStr(collectiveNoun)

    collsPrefixIdx = minPrefixIdx(coll, colls)
    subsPrefixIdx = minPrefixIdx(sub, subs)

    if collsPrefixIdx is None or subsPrefixIdx is None:
        # Unfortunately, could not find a proper cloze deletion for
        # this item. Skipping it.
        # ``sys.stderr.write`` replaces the Python-2-only ``print >>`` form.
        sys.stderr.write("Skipping ({0}, {1}).\n".format(sub, coll))
        return None

    # Quadruple braces render as the literal ``{{`` / ``}}`` of the cloze
    # markup after ``str.format`` has run.
    cloze = (
        '{collPrefix}{{{{c2::{restColl}}}}} of '
        '{subPrefix}{{{{c1::{restSubject}}}}}'
    ).format(
        subPrefix=sub[:subsPrefixIdx],
        restSubject=sub[subsPrefixIdx:],
        collPrefix=coll[:collsPrefixIdx],
        restColl=coll[collsPrefixIdx:],
    )
    return (cloze, hint)
###################################################################
# Execution and output
###################################################################
def process(inputFileName, outputFileName):
    """Convert the collective-noun CSV into a CSV of cloze cards.

    Reads *inputFileName* with ``getInput``, builds a cloze for every
    (subject, noun) pair with ``createCloze``, drops the pairs for which no
    cloze could be built, and writes the remaining ``(cloze, hint)`` rows to
    *outputFileName* with ``csv.writer``.

    Pairs are processed in sorted order so that repeated runs over the same
    input produce the same output file.
    """
    (subToNoun, nounToSub, hints, allPairs) = getInput(inputFileName)

    # ``allPairs`` is a set; sorting gives a stable row order.
    opAll = [createCloze(x, y, nounToSub, subToNoun, hints)
             for x, y in sorted(allPairs)]
    op = [x for x in opAll if x is not None]

    # ``open`` works on both Python 2 and 3; the ``file`` builtin is
    # Python-2-only.
    # NOTE(review): on Python 3 the csv docs recommend opening with
    # newline=''; not passed here because Python 2's open() rejects it.
    with open(outputFileName, 'w') as f:
        w = csv.writer(f)
        w.writerows(op)
Subject Collective noun Notes Sources
*A* A !
*alligators* congregation [1]
*antelopes* *herd* [2][3]
*ants* army [1]
*ants* colony [1][2]
*ants* nest [1]
*ants* swarm [1]
*apes* shrewdness [3][4][5]
*apes* troop [1]
*asses* *herd* [1]
*asses* drove driven in a group [1]
*asses* pace or passe [1][2]
*B* B !
*baboons* troop [1][5]
*badgers* cete [1][2][5][6]
*bats* colony [2]
*bears* sleuth or sloth [1][2][5]
*beavers* colony [1]
*beavers* family [1]
*bees* *swarm* [5]
*bees* drift [5]
*bees* erst [5]
*bees* grist [2]
*birds* *flock* [5]
*birds* dissimulation fanciful [4]
*birds* flight [5]
*birds* pod [5]
*bitterns* sedge or sege [2][4][7]
*boars* (wild boars) sounder 12 or more [6]
*buffalo* *herd* [2][5]
*bullfinches* bellowing [5]
*bullocks* drove driven in a group [5]
*butterflies* flutter [8]
*butterflies* *swarm* [9]
*buzzards* wake [1][2]
*C* C !
*camels* caravan [1]
*camels* flock [1]
*camels* train [1]
*capercaillie* tok [10][11]
*capons* mews [10]
*caribou* herd [10]
*cats* clowder [1][2][10]
*cats* cluster or clutter [1][10]
*cats* glaring [1][10]
*cats* pounce [1][2]
*cats* destruction wild cats (feral cats) [5][12]
*caterpillars* army [1]
*cattle* *herd* [1][2]
*cattle* drove [1][2]
*chickens* brood [1][5]
*chickens* clutch [5]
*chickens* peep [1][5]
*choughs* chattering or clattering [4][5][6][13][14]
*cockroaches* intrusion [1]
*colts* rag [2][5][6]
*colts* rake [5]
*cormorants* gulp [1][2]
*coots* covert [4][5]
*coyotes* *pack* [10]
*cranes* herd [4][5]
*cranes* sedge or sege [2][7][10]
*crocodiles* bask [1][2][10]
*crocodiles* float [1]
*crows* murder [1][2][5][6]
*curlews* herd [4][5][6]
*D* D !
*deer* *herd* [15]
*deer* bunch [15]
*deer* mob [15]
*deer* parcel [16]
*deer* rangale [15]
*dogs* *pack* [15]
*dogfish* troop [15]
*dolphins* *pod* [17][18]
*dolphins* school [15]
*dolphins* team [15]
*dotterel* trip [2][5][15][19]
*doves* dole or dule fanciful [2][4][5][15]
*doves* flight [1][5][15]
*doves* piteousness fanciful [5][15]
*ducks* *flock* [1]
*ducks* badling [2]
*ducks* bunch on water [5][15]
*ducks* paddling on water [1][2][5][15]
*ducks* plump [20][21]
*ducks* raft on water [1][2][5][15]
*ducks* skein in flight [15]
*ducks* sord or sore [15]
*ducks* string in flight [15]
*ducks* team in flight [2][15]
*ducks* waddling [15]
*dunlins* fling [5]
*E* E !
*eagles* convocation [1][2]
*eels* bed [1]
*eels* swarm [1]
*elephants* *herd* [1][2]
*elephants* memory [1]
*elk* gang [1][5][6]
*elk* herd [1][5]
*emus* *mob* [1]
*F* F !
*ferrets* business corruption over time of 'busyness' [1][2][6]
*ferrets* busyness [4]
*ferrets* besyness esynes esnyng eamyng ghost words arising from successive misspellings of "busyness" [22]
*finches* charm or chirm [1][2][5]
*fish* *school* [1][2]
*fish* shoal [1][2]
*flamingoes* stand [1][2]
*flies* business [1][2]
*flies* cloud [1]
*flies* swarm [1]
*foxes* leash [1][2]
*foxes* skulk [1][2]
*frogs* army [1][2]
*G* G !
*geese* *flock* on land [2][23]
*geese* gaggle on land [1][2][23]
*geese* plump flying close [23]
*geese* skein in flight [1][2][23]
*geese* team in flight [23]
*geese* wedge in flight [23]
*giraffes* *herd* [23]
*giraffes* corps [23]
*giraffes* tower [2][23]
*gnats* *cloud* [1][2][5][23]
*gnats* horde [1][2][23]
*gnats* rabble [23]
*gnats* swarm [1][23]
*goats* *flock* [23]
*goats* *herd* [23]
*goats* tribe [23]
*goats* trip or trippe [23]
*goldfinches* charm [23]
*goosanders* dopping [23]
*gorillas* band [23]
*gorillas* whoop [23]
*goshawks* flight [4][23]
*grasshoppers* cloud [2][23]
*grasshoppers* swarm [23]
*grouse* covey [5][23]
*grouse* pack [5][23]
*guineafowl* rasp [24]
*H* H !
*hares* down [25]
*hares* drove [25]
*hares* flick [25]
*hares* husk [25]
*hawks* boil two or more spiralling in flight [1][2]
*hawks* cast [1][2][4][5]
*hawks* kettle flying in large numbers [1][2]
*hawks* lease [4]
*harts* herd [25]
*hedgehogs* array [25]
*herons* sedge or sege [1][2][4][6][7][25]
*herons* siege [1][2][5][25]
*hippopotamuses* bloat [1][2][25]
*hogs* parcel *See also* boars, pigs, swine [26][27]
*hornets* bike [25]
*hornets* nest [2]
*hornets* swarm [25]
*horses* *team* in harness [1][2][25]
*horses* band [1]
*horses* haras or harrase [2][25]
*horses* herd [1][25]
*horses* stable [1][25]
*horses* string [1][2][25]
*horses* stud [2][25]
*hounds* *pack* [25]
*hounds* cry [25]
*hounds* hunt [25]
*hounds* meet [25]
*hounds* mute [25]
*hounds* stable [25]
*hyenas* cackle [1][2]
*I* I !
*ibex* herd [28]
*ibises* colony [28]
*iguanas* mess [28]
*insects* flight [28]
*insects* horde [28]
*insects* plague [28]
*insects* rabble [28]
*insects* swarm [28]
*J* J !
*jackdaws* clattering [29]
*jackdaws* train [29][30]
*jays* band [1]
*jays* party [1][2]
*jays* scold [1][2]
*jellyfish* fluther [5]
*jellyfish* smack [1][2][5]
*K* K !
*kangaroos* mob [1]
*kangaroos* troop [1][2]
*kittens* kindle [5]
*L* L !
*lapwings* deceit or desert [2][4][5][6][31]
*larks* bevy [5]
*larks* exaltation [4][5][6]
*leopards* leap or lepe [1][2][5]
*lions* *pride* [1][2][5]
*lions* sawt [5]
*locusts* plague [2]
*M* M !
*mackerel* shoal [32]
*magpies* charm [2]
*magpies* congregation [32]
*magpies* gulp [2]
*magpies* murder [2]
*magpies* tiding [2][4][5][32]
*magpies* tittering [32]
*mallards* flush [32]
*mallards* puddling [32]
*mallards* sord [2][5][32]
*mallards* suit or sute [5][32]
*mares* stud [5][6][32]
*martens* richesse or richness [2][5][32]
*mice* nest [1][32]
*mice* mischief [33]
*mice* trip [32]
*minnows* shoal [32]
*moles* company [32]
*moles* labor or labour [1][2][5][32]
*monkeys* barrel [1][2]
*monkeys* cartload [32]
*monkeys* tribe [32]
*monkeys* troop [2][5][32]
*monkeys* wilderness [34]
*moorhens* plump [32]
*moose* *herd* [1][32]
*mosquitoes* scourge [32]
*mudhens* fleet [32]
*mules* barren [1][2][5][6][32]
*mules* pack [2][32]
*mules* rake [32]
*mules* span [1][2][32]
*N* N !
*nightingales* watch [4][5]
*O* O !
*owls* parliament [1][2][6]
*oxen* drove [1]
*oxen* team [1][2]
*oxen* yoke [1][2]
*oysters* bed [1][2]
*P* P !
*parrots* company [1][2]
*parrots* pandemonium [1]
*partridges* covey [2][5]
*peacocks* muster [2][5][6]
*peacocks* ostentation [1][2]
*peacocks* pride [1]
*pelicans* pod [1]
*pheasants* bouquet [1][2]
*pheasants* head [5]
*pheasants* nest [2]
*pheasants* nide [2]
*pheasants* nye [1][2][5][6]
*pigs* drift *See also* boars, hogs, swine [1][2]
*pigs* drove [1][2]
*pigeons* kit flying together [5]
*plovers* congregation [4][5]
*plovers* stand [5]
*plovers* wing [5]
*Q* Q !
*quail* bevy [1][2][4][5]
*quail* covey [1][2]
*quail* drift [5]
*R* R !
*raccoons* gaze [1]
*ravens* unkindness [4][5]
*rhinoceroses* crash [1][2][5]
*rooks* building [4][5]
*rooks* parliament [5][6]
*S* S !
*salmon* bind [35]
*salmon* draught [35]
*salmon* run [35]
*sandpipers* fling [35]
*seals* herd [35]
*seals* pod [35]
*seals* rookery [35]
*sea urchins* herd [36][37]
*sheep* *flock* [1][2][35]
*sheep* down [35]
*sheep* drift when driven in a group [35]
*sheep* drove when driven in a group [2][35]
*sheep* herd [2][35]
*sheep* hurtle [1][35]
*sheep* meinie [35]
*sheep* mob [1]
*sheep* parcel [35]
*sheep* trip [35]
*sheldrake* dopping [35][38]
*snails* escargatoire [35]
*snails* rout [35]
*snails* walk [35]
*snakes* bed [1]
*snakes* den [1]
*snakes* knot [1]
*snakes* nest [2]
*snakes* pit [1]
*snipes* walk [5][6][35]
*snipes* whisp or wisp [5][35]
*sparrows* host [1][2][4][5][35]
*sparrows* meinie [35]
*sparrows* tribe [35]
*spiders* cluster or clutter [1][35]
*squirrels* colony [35]
*squirrels* dray [1][2]
*squirrels* scurry [1][2]
*starlings* chattering [1][13][35]
*starlings* clattering [35]
*starlings* cloud [35]
*starlings* congregation [35]
*starlings* murmuration [1][4][5][6][35]
*stoats* pack [35]
*stoats* trip [35]
*storks* flight [35]
*storks* mustering [1][2][35]
*storks* phalanx when migrating [35]
*swallows* flight [1][2][35]
*swallows* gulp [35]
*swans* bank on the ground [1][35]
*swans* bevy [1][2][35]
*swans* drift [35]
*swans* eyrar [35]
*swans* flight [1]
*swans* game [5][35]
*swans* herd [1][35]
*swans* lamentation fanciful [35]
*swans* sownder [35]
*swans* team [35]
*swans* wedge in flight [1][2][5][35]
*swans* whiting [35]
*swifts* *flock* [35]
*swifts* scream [39]
*swine* doylt *See also* boars, hogs, pigs [35]
*swine* drift [6][35]
*swine* trip [35]
*T* T !
*teal* spring [2][4][5]
*tigers* ambush [1]
*tigers* streak [1][2]
*toads* knot [1][2]
*toads* nest [1]
*trout* hover [1][2]
*turkeys* gang [2]
*turkeys* rafter [1][2]
*turtles* bale [1][2]
*turtles* dole [1]
*turtles* nest [2]
*turtle doves* dole or dule [4]
*turtle doves* pitying [2]
*U* U !
*V* V !
*vipers* nest [2]
*W* W !
*walruses* herd [2]
*walruses* pod [2]
*waterfowl* bunch [5]
*waterfowl* knob [5]
*waterfowl* raft [5]
*weasels* colony [1]
*weasels* pack [1]
*whales* gam [1][2]
*whales* herd [2]
*whales* mob [1]
*whales* pod [1][2]
*wigeons* company [2][5]
*wigeons* trip [5]
*wildfowl* bunch [5]
*wildfowl* knob fewer than 30 [5]
*wildfowl* plump [5]
*wildfowl* trip [5]
*wolves* *pack* [1][2]
*wolves* rout or route when in movement [1][2]
*wombats* wisdom [40]
*woodcocks* fall [2][4][5]
*woodpeckers* descent [1][2][5]
*wrens* herd [4][5]
*X* X !
*Y* Y !
*Z* Z !
*zebras* *herd* [1][41]
*zebras* cohort [41]
*zebras* dazzle [42]
*zebras* zeal [1][41]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.