Created
November 17, 2019 23:09
-
-
Save aloncohen1/b4605d3fbfd7476e1ba37cc239d54946 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"result = requests.get(\"https://lotr.fandom.com/sitemap-newsitemapxml-index.xml\")\n", | |
"c = result.content" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"https://lotr.fandom.com/sitemap-newsitemapxml-NS_0-id-2-35626.xml\n", | |
"https://lotr.fandom.com/sitemap-newsitemapxml-NS_0-id-35626-49264.xml\n", | |
"https://lotr.fandom.com/sitemap-newsitemapxml-NS_14-id-21-49203.xml\n", | |
"https://services.fandom.com/discussions-sitemap/sitemap/159\n" | |
] | |
} | |
], | |
"source": [ | |
"from bs4 import BeautifulSoup \n", | |
"import lxml\n", | |
"import xml.etree.ElementTree as ET\n", | |
"root = ET.fromstring(c)\n", | |
"for link in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):\n", | |
" print(link.text)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Found 7080 elements\n" | |
] | |
} | |
], | |
"source": [ | |
"elements = dict()\n", | |
"for page in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):\n", | |
" result = requests.get(page.text)\n", | |
" c = result.content\n", | |
" new_root = ET.fromstring(c)\n", | |
" for element in new_root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):\n", | |
" elements[element.text.split('/')[-1]] = element.text\n", | |
"print('Found {} elements'.format(len(elements)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'https://lotr.fandom.com/wiki/Carolynne_Cunningham'" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"elements['Carolynne_Cunningham']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Hallatan added to the dict, dict len: 300\n", | |
"Prisca_(Baggins)_Bolger added to the dict, dict len: 600\n", | |
"Barahir_(Fourth_Age) added to the dict, dict len: 900\n" | |
] | |
} | |
], | |
"source": [ | |
"characters_dict = {}\n", | |
"counter=0\n", | |
"for k,v in elements.items():\n", | |
" result = requests.get(v)\n", | |
" c = result.content\n", | |
" soup = BeautifulSoup(c, \"html.parser\") # parse HTML page \n", | |
" links = soup.find_all(\"div\", class_='page-header__categories-links')\n", | |
" try:\n", | |
" if '/wiki/Category:Characters' in str(links[0]):\n", | |
" characters_dict[k] = v\n", | |
" counter+=1\n", | |
" if counter in range(0,10000,300):\n", | |
" print('%s added to the dict, dict len: %s' %(k,str(len(characters_dict))))\n", | |
" except:\n", | |
" continue" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1112" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"category_dict={}\n", | |
"for k,v in characters_dict.items():\n", | |
" if 'Category:' in k:\n", | |
" category_dict[k]=v\n", | |
" del characters_dict[k]\n", | |
"len(characters_dict)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'https://lotr.fandom.com/wiki/Adaldrida_(Bolger)_Brandybuck': 'Adaldrida_(Bolger)_Brandybuck',\n", | |
" 'https://lotr.fandom.com/wiki/N%C3%A1li': 'N%C3%A1li',\n", | |
" 'https://lotr.fandom.com/wiki/Ondoher': 'Ondoher',\n", | |
" 'https://lotr.fandom.com/wiki/Thorin_II_Oakenshield': 'Thorin_II_Oakenshield'}" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import itertools\n", | |
"inverted_dict = {v.encode('utf-8'): k for k, v in characters_dict.iteritems()}\n", | |
"dict(itertools.islice(inverted_dict.items(), 0, 4))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Elendur added to the dict, dict len: 100\n", | |
"Borlad added to the dict, dict len: 200\n", | |
"Sigismond_Took added to the dict, dict len: 300\n", | |
"M%C3%ADriel added to the dict, dict len: 400\n", | |
"Idril added to the dict, dict len: 500\n", | |
"Dagor_(sergeant) added to the dict, dict len: 600\n", | |
"Baranor_(Gondor) added to the dict, dict len: 700\n", | |
"Marhari added to the dict, dict len: 800\n", | |
"Laura_(Grubb)_Baggins added to the dict, dict len: 900\n", | |
"Angelica_Baggins added to the dict, dict len: 1000\n", | |
"Arador added to the dict, dict len: 1100\n" | |
] | |
} | |
], | |
"source": [ | |
"import collections\n", | |
"from lxml import html\n", | |
"import pandas as pd\n", | |
"character_count=collections.Counter()\n", | |
"character_text=dict()\n", | |
"character_network = collections.Counter()\n", | |
"base_url = 'https://lotr.fandom.com'\n", | |
"\n", | |
"characters_details=[]\n", | |
"\n", | |
"for character_url, character_name in inverted_dict.items():\n", | |
" character_html = requests.get(character_url)\n", | |
" page = html.fromstring(character_html.text)\n", | |
" soup = BeautifulSoup(character_html.content, \"html.parser\")\n", | |
" \n", | |
" lotr_dict={'character_name':character_name,'character_url': character_url}\n", | |
" lotr_name = soup.find(\"h2\", class_=\"pi-item pi-item-spacing pi-title\")\n", | |
" if lotr_name:\n", | |
" lotr_dict['lotr_name']=lotr_name.contents[0]\n", | |
" for lotr in soup.find_all(\"div\", class_=\"pi-item\"):\n", | |
" lotr_data = lotr.text.split('\\n')\n", | |
" lotr_dict[lotr_data[1].strip().lower()] = lotr_data[2].strip().lower()\n", | |
" characters_details.append(lotr_dict)\n", | |
" \n", | |
" character = {'name':character_name, 'url':character_url, 'html_content':character_html.text } \n", | |
" character['text'] = soup.find('div',{'id':'mw-content-text'}).text \n", | |
" character['links'] = collections.Counter([inverted_dict[base_url+link.get('href').encode('utf-8')] for link in soup.find('div',{'id':'mw-content-text'}).find_all('a') if link.get('href') and (base_url+str(link.get('href').encode('utf-8'))) in inverted_dict])\n", | |
" character_count=character_count + character['links']\n", | |
" character_text[character_name]=soup.find(\"div\",{\"id\":\"mw-content-text\"}).text\n", | |
" for x in character:\n", | |
" x = character_name\n", | |
" for y in character['links']:\n", | |
" character_network[(x, y)] = character['links'][y]\n", | |
" if len(character_text) in range(0,2000,100):\n", | |
" print('%s added to the dict, dict len: %s' %(character_name,str(len(character_text))))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment