Skip to content

Instantly share code, notes, and snippets.

@Poorvak
Created June 8, 2017 07:26
Show Gist options
  • Save Poorvak/85a8494b2997d84179dcef6605bf7a5b to your computer and use it in GitHub Desktop.
Save Poorvak/85a8494b2997d84179dcef6605bf7a5b to your computer and use it in GitHub Desktop.
NER Term Frequency
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Packages used for model training and machine learning\n",
"import sys\n",
"import operator\n",
"\n",
"# import gensim\n",
"from nltk.tree import Tree\n",
"from sklearn.externals import joblib\n",
"from nltk import ne_chunk, pos_tag, word_tokenize"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"try:\n",
" filename = sys.argv[2]\n",
"except:\n",
" filename = \".sample_test\"\n",
"sample = joblib.load(filename=filename)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('madhya pradesh', 5), ('rahul gandhi', 3), ('mandsaur', 3), ('congress', 3), ('delhi', 2), ('indian', 2), ('gujarat', 2), ('raf pipliamandi', 2), ('rapid action force', 2), ('raf', 2), ('congress madhya', 2), ('op tripathi', 1), ('neemuch congress', 1), ('mandsaur raf', 1), ('bahujan samaj party', 1), ('mandsaur congress', 1), ('dig ratlam', 1), ('indian express', 1), ('ajay singh', 1), ('neemuch manoj kumar singh', 1), ('rastriya kisan mazdoor sangh', 1), ('rahul gandhi mandsaur', 1), ('mandsaur rajasthan', 1), ('raf garoth', 1), ('rajasthan', 1), ('neemuch sp manoj kumar singh gandhi', 1), ('bollywood', 1), ('india rbi', 1), ('shivraj singh chouhan', 1), ('madhya', 1), ('gandhi', 1), ('ncrb', 1), ('superintendent', 1), ('sunil goud', 1), ('collector', 1), ('madhya congress', 1), ('raf chouhan', 1), ('entertainment', 1), ('state', 1), ('cag congress', 1), ('new', 1), ('sonkach', 1), ('bhopal', 1), ('india indian', 1), ('rashtriya swayamsevak', 1), ('pipliamandi', 1), ('gandhi congress', 1), ('sharad yadav', 1), ('national crime records bureau', 1), ('madhya pradesh shivraj singh chouhan', 1), ('dewas', 1), ('avinash sharma', 1), ('nda', 1), ('sharad yadav district magistrate', 1), ('sangh', 1), ('maharashtra', 1), ('congress congress', 1), ('rashtriya kisan mazdoor sangh', 1), ('vidarbha', 1), ('uttar', 1), ('mantralaya', 1), ('gandhi mandsaur', 1), ('mandsaur sp', 1), ('madhya pradesh madhya pradesh', 1), ('neemuch sp manoj kumar singh', 1), ('janata dal united', 1), ('bjp', 1), ('garoth', 1), ('congress ajay singh', 1), ('madhya nayagaon', 1), ('congress sp', 1), ('bjp indian', 1), ('ani', 1), ('nayagaon', 1), ('janata dal', 1), ('rahul', 1), ('arun yadav gandhi', 1), ('bhupendra singh', 1), ('superintendent mandsaur', 1), ('mandsaur mandsaur collector swatantra kumar singh', 1), ('reserve bank', 1), ('arun yadav', 1), ('gujarat madhya pradesh', 1), ('mandsaur farmers', 1), ('indore', 1), ('cag', 1), ('india india', 1), ('madhya pradesh tamil nadu', 1), 
('mandsaur gandhi', 1), ('india', 1), ('mandsaur shivraj singh chouhan', 1), ('manoj kumar singh', 1), ('op shrivastava', 1), ('united', 1), ('singh', 1), ('centre', 1), ('aicc', 1), ('madhya pradesh rajasthan', 1), ('neemuch', 1), ('congress bjp', 1), ('sp', 1), ('abhishek singhvi', 1), ('district magistrate', 1), ('uttar pradesh', 1)]\n"
]
}
],
"source": [
"def get_continuous_chunks(text):\n",
" chunked = ne_chunk(pos_tag(word_tokenize(text)))\n",
" prev = None\n",
" continuous_chunk = []\n",
" current_chunk = []\n",
" for i in chunked:\n",
" if type(i) == Tree:\n",
" current_chunk.append(\" \".join([token for token, pos in i.leaves()]))\n",
" elif current_chunk:\n",
" named_entity = \" \".join(current_chunk)\n",
" if named_entity not in continuous_chunk:\n",
" continuous_chunk.append(named_entity)\n",
" current_chunk = []\n",
" else:\n",
" continue\n",
" return continuous_chunk\n",
"\n",
" \n",
"def create_ner_tags(samples, *args, **kwargs):\n",
" ner_tf = dict()\n",
" for sample in samples:\n",
" ner = get_continuous_chunks(text=sample)\n",
" for word in ner:\n",
" if word.lower() in ner_tf:\n",
" ner_tf[word.lower()] += 1\n",
" else:\n",
" ner_tf[word.lower()] = 1\n",
" return ner_tf\n",
"\n",
"ner_tf = create_ner_tags(samples=sample)\n",
"print sorted(ner_tf.items(), key=operator.itemgetter(1), reverse=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment