aloncohen1/scrape_lotr_data.ipynb

## scrape_lotr_data.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "result = requests.get(\"https://lotr.fandom.com/sitemap-newsitemapxml-index.xml\")\n",
    "c = result.content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://lotr.fandom.com/sitemap-newsitemapxml-NS_0-id-2-35626.xml\n",
      "https://lotr.fandom.com/sitemap-newsitemapxml-NS_0-id-35626-49264.xml\n",
      "https://lotr.fandom.com/sitemap-newsitemapxml-NS_14-id-21-49203.xml\n",
      "https://services.fandom.com/discussions-sitemap/sitemap/159\n"
     ]
    }
   ],
   "source": [
    "from bs4 import BeautifulSoup \n",
    "import lxml\n",
    "import xml.etree.ElementTree as ET\n",
    "root = ET.fromstring(c)\n",
    "for link in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):\n",
    "    print(link.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 7080 elements\n"
     ]
    }
   ],
   "source": [
    "elements = dict()\n",
    "for page in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):\n",
    "    result = requests.get(page.text)\n",
    "    c = result.content\n",
    "    new_root = ET.fromstring(c)\n",
    "    for element in new_root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):\n",
    "        elements[element.text.split('/')[-1]] = element.text\n",
    "print('Found {} elements'.format(len(elements)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://lotr.fandom.com/wiki/Carolynne_Cunningham'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "elements['Carolynne_Cunningham']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hallatan added to the dict, dict len: 300\n",
      "Prisca_(Baggins)_Bolger added to the dict, dict len: 600\n",
      "Barahir_(Fourth_Age) added to the dict, dict len: 900\n"
     ]
    }
   ],
   "source": [
    "characters_dict = {}\n",
    "counter=0\n",
    "for k,v in elements.items():\n",
    "    result = requests.get(v)\n",
    "    c = result.content\n",
    "    soup = BeautifulSoup(c, \"html.parser\")  # parse HTML page \n",
    "    links = soup.find_all(\"div\", class_='page-header__categories-links')\n",
    "    try:\n",
    "        if '/wiki/Category:Characters' in str(links[0]):\n",
    "            characters_dict[k] = v\n",
    "            counter+=1\n",
    "            if counter in range(0,10000,300):\n",
    "                print('%s added to the dict, dict len: %s' %(k,str(len(characters_dict))))\n",
    "    except:\n",
    "        continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1112"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "category_dict={}\n",
    "for k,v in characters_dict.items():\n",
    "    if 'Category:' in k:\n",
    "        category_dict[k]=v\n",
    "        del characters_dict[k]\n",
    "len(characters_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'https://lotr.fandom.com/wiki/Adaldrida_(Bolger)_Brandybuck': 'Adaldrida_(Bolger)_Brandybuck',\n",
       " 'https://lotr.fandom.com/wiki/N%C3%A1li': 'N%C3%A1li',\n",
       " 'https://lotr.fandom.com/wiki/Ondoher': 'Ondoher',\n",
       " 'https://lotr.fandom.com/wiki/Thorin_II_Oakenshield': 'Thorin_II_Oakenshield'}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import itertools\n",
    "inverted_dict = {v.encode('utf-8'): k for k, v in characters_dict.iteritems()}\n",
    "dict(itertools.islice(inverted_dict.items(), 0, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Elendur added to the dict, dict len: 100\n",
      "Borlad added to the dict, dict len: 200\n",
      "Sigismond_Took added to the dict, dict len: 300\n",
      "M%C3%ADriel added to the dict, dict len: 400\n",
      "Idril added to the dict, dict len: 500\n",
      "Dagor_(sergeant) added to the dict, dict len: 600\n",
      "Baranor_(Gondor) added to the dict, dict len: 700\n",
      "Marhari added to the dict, dict len: 800\n",
      "Laura_(Grubb)_Baggins added to the dict, dict len: 900\n",
      "Angelica_Baggins added to the dict, dict len: 1000\n",
      "Arador added to the dict, dict len: 1100\n"
     ]
    }
   ],
   "source": [
    "import collections\n",
    "from lxml import html\n",
    "import pandas as pd\n",
    "character_count=collections.Counter()\n",
    "character_text=dict()\n",
    "character_network = collections.Counter()\n",
    "base_url = 'https://lotr.fandom.com'\n",
    "\n",
    "characters_details=[]\n",
    "\n",
    "for character_url, character_name in inverted_dict.items():\n",
    "    character_html = requests.get(character_url)\n",
    "    page = html.fromstring(character_html.text)\n",
    "    soup = BeautifulSoup(character_html.content, \"html.parser\")\n",
    "        \n",
    "    lotr_dict={'character_name':character_name,'character_url': character_url}\n",
    "    lotr_name = soup.find(\"h2\", class_=\"pi-item pi-item-spacing pi-title\")\n",
    "    if lotr_name:\n",
    "        lotr_dict['lotr_name']=lotr_name.contents[0]\n",
    "    for lotr in soup.find_all(\"div\", class_=\"pi-item\"):\n",
    "        lotr_data = lotr.text.split('\\n')\n",
    "        lotr_dict[lotr_data[1].strip().lower()] = lotr_data[2].strip().lower()\n",
    "    characters_details.append(lotr_dict)\n",
    "        \n",
    "    character = {'name':character_name, 'url':character_url, 'html_content':character_html.text }    \n",
    "    character['text'] = soup.find('div',{'id':'mw-content-text'}).text \n",
    "    character['links'] = collections.Counter([inverted_dict[base_url+link.get('href').encode('utf-8')] for link in soup.find('div',{'id':'mw-content-text'}).find_all('a') if link.get('href') and (base_url+str(link.get('href').encode('utf-8'))) in inverted_dict])\n",
    "    character_count=character_count + character['links']\n",
    "    character_text[character_name]=soup.find(\"div\",{\"id\":\"mw-content-text\"}).text\n",
    "    for x in character:\n",
    "        x = character_name\n",
    "        for y in character['links']:\n",
    "            character_network[(x, y)] = character['links'][y]\n",
    "    if len(character_text) in range(0,2000,100):\n",
    "        print('%s added to the dict, dict len: %s' %(character_name,str(len(character_text))))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import requests\n",
	"result = requests.get(\"https://lotr.fandom.com/sitemap-newsitemapxml-index.xml\")\n",
	"c = result.content"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"https://lotr.fandom.com/sitemap-newsitemapxml-NS_0-id-2-35626.xml\n",
	"https://lotr.fandom.com/sitemap-newsitemapxml-NS_0-id-35626-49264.xml\n",
	"https://lotr.fandom.com/sitemap-newsitemapxml-NS_14-id-21-49203.xml\n",
	"https://services.fandom.com/discussions-sitemap/sitemap/159\n"
	]
	}
	],
	"source": [
	"from bs4 import BeautifulSoup \n",
	"import lxml\n",
	"import xml.etree.ElementTree as ET\n",
	"root = ET.fromstring(c)\n",
	"for link in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):\n",
	" print(link.text)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Found 7080 elements\n"
	]
	}
	],
	"source": [
	"elements = dict()\n",
	"for page in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):\n",
	" result = requests.get(page.text)\n",
	" c = result.content\n",
	" new_root = ET.fromstring(c)\n",
	" for element in new_root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):\n",
	" elements[element.text.split('/')[-1]] = element.text\n",
	"print('Found {} elements'.format(len(elements)))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'https://lotr.fandom.com/wiki/Carolynne_Cunningham'"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"elements['Carolynne_Cunningham']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Hallatan added to the dict, dict len: 300\n",
	"Prisca_(Baggins)_Bolger added to the dict, dict len: 600\n",
	"Barahir_(Fourth_Age) added to the dict, dict len: 900\n"
	]
	}
	],
	"source": [
	"characters_dict = {}\n",
	"counter=0\n",
	"for k,v in elements.items():\n",
	" result = requests.get(v)\n",
	" c = result.content\n",
	" soup = BeautifulSoup(c, \"html.parser\") # parse HTML page \n",
	" links = soup.find_all(\"div\", class_='page-header__categories-links')\n",
	" try:\n",
	" if '/wiki/Category:Characters' in str(links[0]):\n",
	" characters_dict[k] = v\n",
	" counter+=1\n",
	" if counter in range(0,10000,300):\n",
	" print('%s added to the dict, dict len: %s' %(k,str(len(characters_dict))))\n",
	" except:\n",
	" continue"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1112"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"category_dict={}\n",
	"for k,v in characters_dict.items():\n",
	" if 'Category:' in k:\n",
	" category_dict[k]=v\n",
	" del characters_dict[k]\n",
	"len(characters_dict)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'https://lotr.fandom.com/wiki/Adaldrida_(Bolger)_Brandybuck': 'Adaldrida_(Bolger)_Brandybuck',\n",
	" 'https://lotr.fandom.com/wiki/N%C3%A1li': 'N%C3%A1li',\n",
	" 'https://lotr.fandom.com/wiki/Ondoher': 'Ondoher',\n",
	" 'https://lotr.fandom.com/wiki/Thorin_II_Oakenshield': 'Thorin_II_Oakenshield'}"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"import itertools\n",
	"inverted_dict = {v.encode('utf-8'): k for k, v in characters_dict.iteritems()}\n",
	"dict(itertools.islice(inverted_dict.items(), 0, 4))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Elendur added to the dict, dict len: 100\n",
	"Borlad added to the dict, dict len: 200\n",
	"Sigismond_Took added to the dict, dict len: 300\n",
	"M%C3%ADriel added to the dict, dict len: 400\n",
	"Idril added to the dict, dict len: 500\n",
	"Dagor_(sergeant) added to the dict, dict len: 600\n",
	"Baranor_(Gondor) added to the dict, dict len: 700\n",
	"Marhari added to the dict, dict len: 800\n",
	"Laura_(Grubb)_Baggins added to the dict, dict len: 900\n",
	"Angelica_Baggins added to the dict, dict len: 1000\n",
	"Arador added to the dict, dict len: 1100\n"
	]
	}
	],
	"source": [
	"import collections\n",
	"from lxml import html\n",
	"import pandas as pd\n",
	"character_count=collections.Counter()\n",
	"character_text=dict()\n",
	"character_network = collections.Counter()\n",
	"base_url = 'https://lotr.fandom.com'\n",
	"\n",
	"characters_details=[]\n",
	"\n",
	"for character_url, character_name in inverted_dict.items():\n",
	" character_html = requests.get(character_url)\n",
	" page = html.fromstring(character_html.text)\n",
	" soup = BeautifulSoup(character_html.content, \"html.parser\")\n",
	" \n",
	" lotr_dict={'character_name':character_name,'character_url': character_url}\n",
	" lotr_name = soup.find(\"h2\", class_=\"pi-item pi-item-spacing pi-title\")\n",
	" if lotr_name:\n",
	" lotr_dict['lotr_name']=lotr_name.contents[0]\n",
	" for lotr in soup.find_all(\"div\", class_=\"pi-item\"):\n",
	" lotr_data = lotr.text.split('\\n')\n",
	" lotr_dict[lotr_data[1].strip().lower()] = lotr_data[2].strip().lower()\n",
	" characters_details.append(lotr_dict)\n",
	" \n",
	" character = {'name':character_name, 'url':character_url, 'html_content':character_html.text } \n",
	" character['text'] = soup.find('div',{'id':'mw-content-text'}).text \n",
	" character['links'] = collections.Counter([inverted_dict[base_url+link.get('href').encode('utf-8')] for link in soup.find('div',{'id':'mw-content-text'}).find_all('a') if link.get('href') and (base_url+str(link.get('href').encode('utf-8'))) in inverted_dict])\n",
	" character_count=character_count + character['links']\n",
	" character_text[character_name]=soup.find(\"div\",{\"id\":\"mw-content-text\"}).text\n",
	" for x in character:\n",
	" x = character_name\n",
	" for y in character['links']:\n",
	" character_network[(x, y)] = character['links'][y]\n",
	" if len(character_text) in range(0,2000,100):\n",
	" print('%s added to the dict, dict len: %s' %(character_name,str(len(character_text))))"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}