Created
July 14, 2017 04:17
-
-
Save devashishd12/b5ef3dc5126fd0781dbf25c2d5346b6e to your computer and use it in GitHub Desktop.
Mapping uri to words
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import difflib\n", | |
"import re\n", | |
"from nltk import pos_tag" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Read the dataset from the JSON file next to the notebook into a DataFrame.\n", | |
"df = pd.read_json('./data_v7.json')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Index(['_id', 'corrected_question', 'sparql_query', 'sparql_template_id',\n", | |
" 'verbalized_question'],\n", | |
" dtype='object')" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Show the column labels of the loaded frame.\n", | |
"df.columns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Name the scientist whose supervisor was Ernest Rutherford and had a doctoral students named Charles Drummond Ellis?\n", | |
"\n", | |
"SELECT DISTINCT ?uri WHERE {?uri <http://dbpedia.org/ontology/doctoralAdvisor> <http://dbpedia.org/resource/Ernest_Rutherford> . ?uri <http://dbpedia.org/property/doctoralStudents> <http://dbpedia.org/resource/Charles_Drummond_Ellis> . ?uri <https://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Scientist>}\n", | |
"\n", | |
"What is the <scientist> whose <supervisor> is <Ernest Rutherford> and <doctoral students> is <Charles Drummond Ellis>?\n", | |
"\n", | |
"[('Name', 'NN'), ('the', 'DT'), ('scientist', 'NN'), ('whose', 'WP$'), ('supervisor', 'NN'), ('was', 'VBD'), ('Ernest', 'NNP'), ('Rutherford', 'NNP'), ('and', 'CC'), ('had', 'VBD'), ('a', 'DT'), ('doctoral', 'JJ'), ('students', 'NNS'), ('named', 'VBN'), ('Charles', 'NNP'), ('Drummond', 'NNP'), ('Ellis?', 'NNP')]\n" | |
] | |
} | |
], | |
"source": [ | |
"i = 10\n", | |
"print(df.loc[i][1])\n", | |
"print()\n", | |
"print(df.loc[i][2])\n", | |
"print()\n", | |
"print(df.loc[i][4])\n", | |
"print()\n", | |
"print(pos_tag(df.loc[i][1].split()))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"uris = re.findall(r'http://dbpedia.org/[^>]*', df.loc[i][2])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['http://dbpedia.org/ontology/doctoralAdvisor',\n", | |
" 'http://dbpedia.org/resource/Ernest_Rutherford',\n", | |
" 'http://dbpedia.org/property/doctoralStudents',\n", | |
" 'http://dbpedia.org/resource/Charles_Drummond_Ellis',\n", | |
" 'http://dbpedia.org/ontology/Scientist']" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Display the list of URIs extracted from the SPARQL query.\n", | |
"uris" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"l = re.findall(r'\\S+[^?]', df.loc[i][1])\n", | |
"l = [x.strip().lower() for x in l]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"final_dict = {}\n", | |
"for uri in uris:\n", | |
" s = uri.split('/')\n", | |
" label, entity = s[-2], s[-1]\n", | |
" if label == 'resource':\n", | |
" entity = re.sub('_', ' ', entity).lower()\n", | |
" start = 1000\n", | |
" build = ''\n", | |
" for match in difflib.get_close_matches(entity, l, 5, 0):\n", | |
" if match in entity:\n", | |
" build += ' ' + match\n", | |
" temp = df.loc[i][1].lower().find(match)\n", | |
" if temp < start:\n", | |
" start = temp\n", | |
" else:\n", | |
" break\n", | |
" build = build.strip()\n", | |
" length = len(build)\n", | |
" end = start + length - 1\n", | |
" final_dict.update({uri: [start, end]})\n", | |
" else:\n", | |
" continue" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'http://dbpedia.org/resource/Charles_Drummond_Ellis': [92, 113],\n", | |
" 'http://dbpedia.org/resource/Ernest_Rutherford': [40, 56]}" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Display the mapping built above: resource URI -> [start, end].\n", | |
"final_dict" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment