Skip to content

Instantly share code, notes, and snippets.

@devashishd12
Created July 14, 2017 04:17
Show Gist options
  • Save devashishd12/b5ef3dc5126fd0781dbf25c2d5346b6e to your computer and use it in GitHub Desktop.
Save devashishd12/b5ef3dc5126fd0781dbf25c2d5346b6e to your computer and use it in GitHub Desktop.
Mapping uri to words
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import difflib\n",
"import re\n",
"from nltk import pos_tag"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.read_json('./data_v7.json')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['_id', 'corrected_question', 'sparql_query', 'sparql_template_id',\n",
" 'verbalized_question'],\n",
" dtype='object')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Name the scientist whose supervisor was Ernest Rutherford and had a doctoral students named Charles Drummond Ellis?\n",
"\n",
"SELECT DISTINCT ?uri WHERE {?uri <http://dbpedia.org/ontology/doctoralAdvisor> <http://dbpedia.org/resource/Ernest_Rutherford> . ?uri <http://dbpedia.org/property/doctoralStudents> <http://dbpedia.org/resource/Charles_Drummond_Ellis> . ?uri <https://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Scientist>}\n",
"\n",
"What is the <scientist> whose <supervisor> is <Ernest Rutherford> and <doctoral students> is <Charles Drummond Ellis>?\n",
"\n",
"[('Name', 'NN'), ('the', 'DT'), ('scientist', 'NN'), ('whose', 'WP$'), ('supervisor', 'NN'), ('was', 'VBD'), ('Ernest', 'NNP'), ('Rutherford', 'NNP'), ('and', 'CC'), ('had', 'VBD'), ('a', 'DT'), ('doctoral', 'JJ'), ('students', 'NNS'), ('named', 'VBN'), ('Charles', 'NNP'), ('Drummond', 'NNP'), ('Ellis?', 'NNP')]\n"
]
}
],
"source": [
"# Pick one example row and print its question, SPARQL query and\n",
"# verbalized template, followed by POS tags for the question.\n",
"# Columns are addressed by label: positional access such as df.loc[i][1]\n",
"# depends on column order and is deprecated for label-indexed Series.\n",
"i = 10\n",
"row = df.loc[i]\n",
"print(row['corrected_question'])\n",
"print()\n",
"print(row['sparql_query'])\n",
"print()\n",
"print(row['verbalized_question'])\n",
"print()\n",
"# Whitespace tokenisation keeps punctuation attached (e.g. 'Ellis?').\n",
"print(pos_tag(row['corrected_question'].split()))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Collect every http://dbpedia.org/ URI in the example's SPARQL query.\n",
"# Each match runs up to (not including) the closing '>'. The dots are\n",
"# escaped so they match a literal '.' only, and the query column is\n",
"# addressed by label instead of by position.\n",
"uris = re.findall(r'http://dbpedia\\.org/[^>]*', df.loc[i, 'sparql_query'])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['http://dbpedia.org/ontology/doctoralAdvisor',\n",
" 'http://dbpedia.org/resource/Ernest_Rutherford',\n",
" 'http://dbpedia.org/property/doctoralStudents',\n",
" 'http://dbpedia.org/resource/Charles_Drummond_Ellis',\n",
" 'http://dbpedia.org/ontology/Scientist']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"uris"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Lower-cased whitespace tokens of the question with any trailing '?'\n",
"# removed; these are the candidates for fuzzy matching against entity names.\n",
"# str.split() replaces the regex r'\\S+[^?]', which silently dropped a\n",
"# one-character final token (e.g. 'x?') and kept the '?' on any token\n",
"# that was followed by whitespace.\n",
"l = [w.rstrip('?').lower() for w in df.loc[i, 'corrected_question'].split()]\n",
"# A token consisting only of '?' becomes empty after stripping; drop it.\n",
"l = [w for w in l if w]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Map each DBpedia resource URI to the inclusive [start, end] character\n",
"# span of its mention in the lower-cased question.\n",
"question_lc = df.loc[i, 'corrected_question'].lower()\n",
"final_dict = {}\n",
"for uri in uris:\n",
"    label, entity = uri.split('/')[-2:]\n",
"    if label != 'resource':\n",
"        # Only .../resource/<name> URIs are mapped; others are skipped.\n",
"        continue\n",
"    entity = entity.replace('_', ' ').lower()\n",
"    matched = []\n",
"    # cutoff=0 with n=5 yields the (at most) five question tokens most\n",
"    # similar to the entity name, best first. Keep the leading run of\n",
"    # tokens that occur inside the entity name (substring check).\n",
"    for match in difflib.get_close_matches(entity, l, 5, 0):\n",
"        if match not in entity:\n",
"            break\n",
"        matched.append(match)\n",
"    if not matched:\n",
"        # No token belongs to this entity. Previously the sentinel start\n",
"        # value leaked out as the bogus span [1000, 999]; leave the URI out.\n",
"        continue\n",
"    # The span starts at the earliest first-occurrence of a matched token.\n",
"    # NOTE(review): str.find can also hit inside a longer, earlier word.\n",
"    start = min(question_lc.find(match) for match in matched)\n",
"    # Length of the matched tokens joined by single spaces.\n",
"    end = start + len(' '.join(matched)) - 1\n",
"    final_dict[uri] = [start, end]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'http://dbpedia.org/resource/Charles_Drummond_Ellis': [92, 113],\n",
" 'http://dbpedia.org/resource/Ernest_Rutherford': [40, 56]}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_dict"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment