Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save GlinZachariah/dd97c1b0d83397548a7b40a07cb183d0 to your computer and use it in GitHub Desktop.
Save GlinZachariah/dd97c1b0d83397548a7b40a07cb183d0 to your computer and use it in GitHub Desktop.
Generate movie data as JSON from Wikipedia (base layout). Further improvements are welcome.
'''
Generate movie data as json from Wikipedia
'''
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import re
import json
#url of the page we want to scrape
class MovieDataFetcher:
    """Scrape a Wikidata entity page for movie properties and save them as JSON.

    Constructing an instance immediately fetches and processes the page
    (same behaviour as the original script)::

        MovieDataFetcher("https://www.wikidata.org/wiki/Q73028")

    The result is written to ``<Movie_Title>.json`` in the working directory.
    """

    # Values matching this pattern are Wikipedia/Wikimedia maintenance
    # entries (categories, source URLs, "retrieved" qualifiers, ...) rather
    # than movie data, and are filtered out. Compiled once instead of
    # re-parsed on every anchor.
    _SKIP_PATTERN = re.compile(
        r'(.*?)Wikimedia(.*?)|(.+?)Wikipedia|inferred from|retrieved|(.*?)URL|Category:(.*?)'
    )

    def __init__(self, urlX):
        """Store the target URL, fetch the page, and process it.

        :param urlX: full URL of the Wikidata entity page to scrape
        """
        # Instance attributes replace the original module-level ``global``
        # variables, so two fetchers no longer clobber each other's state.
        self.url = urlX
        self.html = ''
        print("Hello World")
        self.loadDriver()
        # BUG FIX: the original did ``return response`` here — returning a
        # non-None value from __init__ raises TypeError at runtime.
        self.processPage()

    def loadDriver(self):
        """Fetch the page with Selenium so JavaScript-rendered content is present."""
        print("Loading Drivers...")
        # Path to the chromedriver binary. NOTE(review): Selenium 4 prefers
        # a Service object over a positional path; kept for compatibility
        # with the pinned selenium version this script was written for.
        driver = webdriver.Chrome('./chromedriver')
        try:
            driver.get(self.url)
            # Crude fixed wait to let the page finish rendering.
            time.sleep(5)
            self.html = driver.page_source
        finally:
            # BUG FIX: always release the browser, even if the fetch raises;
            # the original leaked the driver on any exception.
            driver.close()
        print("Page fetching..")

    def processPage(self):
        """Parse the fetched HTML into a property -> values dict and save it as JSON."""
        time.sleep(2)
        print("Processing page..")
        # BUG FIX: the original re-downloaded the page with ``requests``,
        # discarding the JavaScript-rendered HTML Selenium just produced
        # (which was the whole point of using Selenium).
        soup = BeautifulSoup(self.html, "html.parser")
        title_res = soup.find(id="content")
        title = title_res.find_all("span", class_="wikibase-title-label")
        movie_name = title[0].text
        print("Movie :: " + movie_name)
        results = soup.find(id="mw-content-text")
        # Wikidata renders one statement group per property.
        prop_groups = results.find_all("div", class_="wikibase-statementgroupview")
        data = {}
        for prop in prop_groups:
            # The first surviving anchor is the property name; the rest
            # are its values. Maintenance-only anchors are dropped.
            entries = [
                a.text for a in prop.find_all("a")
                if self._SKIP_PATTERN.search(a.text) is None
            ]
            # BUG FIX: guard against a fully-filtered group — the original
            # ``pop(0)`` raised IndexError when every anchor was skipped.
            if entries:
                key = entries.pop(0)
                data[key] = entries
        print(json.dumps(data, ensure_ascii=False))
        out_name = movie_name.replace(' ', '_') + '.json'
        with open(out_name, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)
        print("Saved successfully in " + out_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment