Skip to content

Instantly share code, notes, and snippets.

@rjollet
Created April 25, 2016 08:43
Show Gist options
  • Save rjollet/2e56a3f770cac6f47df404c73e3e94c9 to your computer and use it in GitHub Desktop.
Save rjollet/2e56a3f770cac6f47df404c73e3e94c9 to your computer and use it in GitHub Desktop.
import json
import argparse
def load_corpus(path):
    """Read a corpus file and return its distinct words as a set.

    The file is expected to hold one word per line. Surrounding whitespace
    (including the line terminator) is stripped from every line so that a
    final word without a trailing newline still compares equal to the same
    word elsewhere. Lines that are empty after stripping are ignored.

    Args:
        path: Path of the text file to read.

    Returns:
        A set of the non-empty, stripped lines of the file.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    # The context manager guarantees the handle is closed even on error.
    with open(path, 'r') as handle:
        stripped = (line.strip() for line in handle)
        return {word for word in stripped if word}


def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 if denominator is 0."""
    if not denominator:
        # An empty corpus (or empty union) has no meaningful ratio; report 0.0
        # instead of raising ZeroDivisionError.
        return 0.0
    # float() keeps this a true division on Python 2 as well as Python 3.
    return numerator / float(denominator)


def compare_corpora(corpus1, corpus2):
    """Compute overlap metrics between two sets of words.

    Args:
        corpus1: Set of words of the first corpus.
        corpus2: Set of words of the second corpus.

    Returns:
        A dict with three float entries:
        "Jaccard" (size of intersection over size of union),
        "intersection / corpus1" and "intersection / corpus2"
        (size of intersection over the size of the respective corpus).
        Any ratio whose denominator is zero is reported as 0.0.
    """
    # Jaccard similarity: intersection over union.
    intersection = corpus1 & corpus2
    union = corpus1 | corpus2
    shared = len(intersection)
    return {
        "Jaccard": _ratio(shared, len(union)),
        "intersection / corpus1": _ratio(shared, len(corpus1)),
        "intersection / corpus2": _ratio(shared, len(corpus2)),
    }


def main(argv=None):
    """Parse command-line options and print the comparison as JSON.

    Args:
        argv: Optional list of argument strings; when None, argparse reads
            the process arguments (sys.argv[1:]).

    Nothing is printed unless both --corpus1 and --corpus2 are supplied.
    """
    parser = argparse.ArgumentParser(description='Compare two corpus of words each corpus is a text file containing one word per line.')
    parser.add_argument("--corpus1", help="file contening the corpus1")
    parser.add_argument("--corpus2", help="file file contening the corpus2")
    args = parser.parse_args(argv)
    if args.corpus1 and args.corpus2:
        res = compare_corpora(load_corpus(args.corpus1), load_corpus(args.corpus2))
        print(json.dumps(res, indent=4))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment