Skip to content

Instantly share code, notes, and snippets.

@jokester
Created January 7, 2013 17:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jokester/4476924 to your computer and use it in GitHub Desktop.
Save jokester/4476924 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
"""minimalign.py: minimal version of anymalign.py
Adrien Lardilleux <Adrien.Lardilleux@limsi.fr>
http://users.info.unicaen.fr/~alardill/anymalign/
"""
import sys
import random
NB_SAMPLES = 10 # The larger, the more alignments
# Read input file and load bicorpus into memory, as a list of pairs of
# sentences (1 sentence = 1 list of words)
sourceFile = open(sys.argv[1], 'r')
targetFile = open(sys.argv[2], 'r')
corpus = zip((line.split() for line in sourceFile),
(line.split() for line in targetFile))
sourceFile.close()
targetFile.close()
allAlignments = {} # Simple counter {alignmentString: integerCount, ...}
# Main loop
for i in xrange(NB_SAMPLES):
# Select a random subcorpus
subcorpusSize = random.randrange(0, len(corpus))
selection = random.sample(xrange(len(corpus)), subcorpusSize)
# Assign to each word of the subcorpus the line ids it appears on
sourceWord_vec = {} # {string: [lineNo, ...], ...}
targetWord_vec = {}
for lineId in selection:
sourceSentence, targetSentence = corpus[lineId]
for word in sourceSentence:
if word not in sourceWord_vec:
sourceWord_vec[word] = []
sourceWord_vec[word].append(lineId)
for word in targetSentence:
if word not in targetWord_vec:
targetWord_vec[word] = []
targetWord_vec[word].append(lineId)
# Group words according to the lines they appear on
vec_words = {} # {tupleOfLineNos: ([srcWord, ...], [tgtWord, ...]), ...}
for word in sourceWord_vec:
vec = tuple(sourceWord_vec[word])
if vec not in vec_words:
vec_words[vec] = ([], [])
vec_words[vec][0].append(word)
for word in targetWord_vec:
vec = tuple(targetWord_vec[word])
if vec in vec_words:
vec_words[vec][1].append(word)
# else: there will not be any alignment since the source part is empty
# For each group of words, make a new pass on the subcorpus to extract
# alignments and their contexts
for vec in vec_words:
sourceWords, targetWords = vec_words[vec]
if not targetWords: # target part is empty -> no alignment
continue
sourceSet = set(sourceWords) # Speed up searches
targetSet = set(targetWords)
for lineId in vec:
sourceSentence, targetSentence = corpus[lineId]
sourceAl = [] # Same words as in <sourceSet>, but ordered
targetAl = []
sourceCont = [] # Complementary of <sourceAl> on the line
targetCont = []
for word in sourceSentence:
if word in sourceSet:
sourceAl.append(word)
else:
sourceCont.append(word)
for word in targetSentence:
if word in targetSet:
targetAl.append(word)
else:
targetCont.append(word)
# We get alignments only if both the source and the target parts
# actually contain words. If so, increase alignment's count.
if sourceAl and targetAl:
alignment = "%s\t%s" % (" ".join(sourceAl), " ".join(targetAl))
if alignment not in allAlignments:
allAlignments[alignment] = 0
allAlignments[alignment] += 1
if sourceCont and targetCont:
alignment = "%s\t%s" % (" ".join(sourceCont),
" ".join(targetCont))
if alignment not in allAlignments:
allAlignments[alignment] = 0
allAlignments[alignment] += 1
# End of main loop
# Sort all alignments according to their count and output everything
allAlignments = allAlignments.items()
allAlignments.sort(key=lambda x:x[1], reverse=True)
for alignment, count in allAlignments:
print "%s\t%i" % (alignment, count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment