@aloncohen1
Created November 17, 2019 23:09
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"result = requests.get(\"https://lotr.fandom.com/sitemap-newsitemapxml-index.xml\")\n",
"c = result.content"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"https://lotr.fandom.com/sitemap-newsitemapxml-NS_0-id-2-35626.xml\n",
"https://lotr.fandom.com/sitemap-newsitemapxml-NS_0-id-35626-49264.xml\n",
"https://lotr.fandom.com/sitemap-newsitemapxml-NS_14-id-21-49203.xml\n",
"https://services.fandom.com/discussions-sitemap/sitemap/159\n"
]
}
],
"source": [
"from bs4 import BeautifulSoup \n",
"import lxml\n",
"import xml.etree.ElementTree as ET\n",
"root = ET.fromstring(c)\n",
"for link in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):\n",
" print(link.text)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 7080 elements\n"
]
}
],
"source": [
"elements = dict()\n",
"for page in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):\n",
" result = requests.get(page.text)\n",
" c = result.content\n",
" new_root = ET.fromstring(c)\n",
" for element in new_root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):\n",
" elements[element.text.split('/')[-1]] = element.text\n",
"print('Found {} elements'.format(len(elements)))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'https://lotr.fandom.com/wiki/Carolynne_Cunningham'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"elements['Carolynne_Cunningham']"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Hallatan added to the dict, dict len: 300\n",
"Prisca_(Baggins)_Bolger added to the dict, dict len: 600\n",
"Barahir_(Fourth_Age) added to the dict, dict len: 900\n"
]
}
],
"source": [
"characters_dict = {}\n",
"counter=0\n",
"for k,v in elements.items():\n",
" result = requests.get(v)\n",
" c = result.content\n",
" soup = BeautifulSoup(c, \"html.parser\") # parse HTML page \n",
" links = soup.find_all(\"div\", class_='page-header__categories-links')\n",
" try:\n",
" if '/wiki/Category:Characters' in str(links[0]):\n",
" characters_dict[k] = v\n",
" counter+=1\n",
" if counter in range(0,10000,300):\n",
" print('%s added to the dict, dict len: %s' %(k,str(len(characters_dict))))\n",
" except:\n",
" continue"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1112"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"category_dict={}\n",
"for k,v in characters_dict.items():\n",
" if 'Category:' in k:\n",
" category_dict[k]=v\n",
" del characters_dict[k]\n",
"len(characters_dict)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"{'https://lotr.fandom.com/wiki/Adaldrida_(Bolger)_Brandybuck': 'Adaldrida_(Bolger)_Brandybuck',\n",
" 'https://lotr.fandom.com/wiki/N%C3%A1li': 'N%C3%A1li',\n",
" 'https://lotr.fandom.com/wiki/Ondoher': 'Ondoher',\n",
" 'https://lotr.fandom.com/wiki/Thorin_II_Oakenshield': 'Thorin_II_Oakenshield'}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import itertools\n",
"inverted_dict = {v.encode('utf-8'): k for k, v in characters_dict.iteritems()}\n",
"dict(itertools.islice(inverted_dict.items(), 0, 4))"
]
},
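{
"cell_type": "markdown",
"metadata": {},
"source": [
"Main scraping loop: for every character page, collect the infobox fields into `characters_details`, store the article text in `character_text`, and count links to other character pages to build `character_network` (a Counter keyed by (source, target) pairs)."
]
},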
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Elendur added to the dict, dict len: 100\n",
"Borlad added to the dict, dict len: 200\n",
"Sigismond_Took added to the dict, dict len: 300\n",
"M%C3%ADriel added to the dict, dict len: 400\n",
"Idril added to the dict, dict len: 500\n",
"Dagor_(sergeant) added to the dict, dict len: 600\n",
"Baranor_(Gondor) added to the dict, dict len: 700\n",
"Marhari added to the dict, dict len: 800\n",
"Laura_(Grubb)_Baggins added to the dict, dict len: 900\n",
"Angelica_Baggins added to the dict, dict len: 1000\n",
"Arador added to the dict, dict len: 1100\n"
]
}
],
"source": [
"import collections\n",
"from lxml import html\n",
"import pandas as pd\n",
"character_count=collections.Counter()\n",
"character_text=dict()\n",
"character_network = collections.Counter()\n",
"base_url = 'https://lotr.fandom.com'\n",
"\n",
"characters_details=[]\n",
"\n",
"for character_url, character_name in inverted_dict.items():\n",
" character_html = requests.get(character_url)\n",
" page = html.fromstring(character_html.text)\n",
" soup = BeautifulSoup(character_html.content, \"html.parser\")\n",
" \n",
" lotr_dict={'character_name':character_name,'character_url': character_url}\n",
" lotr_name = soup.find(\"h2\", class_=\"pi-item pi-item-spacing pi-title\")\n",
" if lotr_name:\n",
" lotr_dict['lotr_name']=lotr_name.contents[0]\n",
" for lotr in soup.find_all(\"div\", class_=\"pi-item\"):\n",
" lotr_data = lotr.text.split('\\n')\n",
" lotr_dict[lotr_data[1].strip().lower()] = lotr_data[2].strip().lower()\n",
" characters_details.append(lotr_dict)\n",
" \n",
" character = {'name':character_name, 'url':character_url, 'html_content':character_html.text } \n",
" character['text'] = soup.find('div',{'id':'mw-content-text'}).text \n",
" character['links'] = collections.Counter([inverted_dict[base_url+link.get('href').encode('utf-8')] for link in soup.find('div',{'id':'mw-content-text'}).find_all('a') if link.get('href') and (base_url+str(link.get('href').encode('utf-8'))) in inverted_dict])\n",
" character_count=character_count + character['links']\n",
" character_text[character_name]=soup.find(\"div\",{\"id\":\"mw-content-text\"}).text\n",
" for x in character:\n",
" x = character_name\n",
" for y in character['links']:\n",
" character_network[(x, y)] = character['links'][y]\n",
" if len(character_text) in range(0,2000,100):\n",
" print('%s added to the dict, dict len: %s' %(character_name,str(len(character_text))))"
]
}
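,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal follow-up sketch (not part of the original run): the collected structures can be tabulated with the already-imported pandas for inspection or export. `details_df`, `edges_df`, and the `source`/`target`/`weight` column names are assumptions, not from the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: turn the scraped results into DataFrames.\n",
"# Column names below are hypothetical; adjust to taste.\n",
"details_df = pd.DataFrame(characters_details)  # one row per character infobox\n",
"edges_df = pd.DataFrame(\n",
"    [(s, t, w) for (s, t), w in character_network.items()],\n",
"    columns=['source', 'target', 'weight'])    # link-count edge list\n",
"details_df.head(), edges_df.sort_values('weight', ascending=False).head()"
]
}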
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}