Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save GlinZachariah/dd97c1b0d83397548a7b40a07cb183d0 to your computer and use it in GitHub Desktop.
Save GlinZachariah/dd97c1b0d83397548a7b40a07cb183d0 to your computer and use it in GitHub Desktop.
Generate movie data as JSON from Wikipedia (base layout). Further improvements are welcome.
'''
Generate movie data as json from Wikipedia
'''
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import re
import json
#url of the page we want to scrape
class MovieDataFetcher:
    """Scrape a Wikidata entity page for movie properties and save them as JSON.

    Constructing an instance immediately fetches and processes the page
    (same behaviour as the original script)::

        MovieDataFetcher("https://www.wikidata.org/wiki/Q73028")

    The result is written to ``<Movie_Title>.json`` in the working directory.
    """

    # Values matching this pattern are Wikipedia/Wikimedia maintenance
    # entries (categories, source URLs, "retrieved" qualifiers, ...) rather
    # than movie data, and are filtered out. Compiled once instead of
    # re-parsed on every anchor.
    _SKIP_PATTERN = re.compile(
        r'(.*?)Wikimedia(.*?)|(.+?)Wikipedia|inferred from|retrieved|(.*?)URL|Category:(.*?)'
    )

    def __init__(self, urlX):
        """Store the target URL, fetch the page, and process it.

        :param urlX: full URL of the Wikidata entity page to scrape
        """
        # Instance attributes replace the original module-level ``global``
        # variables, so two fetchers no longer clobber each other's state.
        self.url = urlX
        self.html = ''
        print("Hello World")
        self.loadDriver()
        # BUG FIX: the original did ``return response`` here — returning a
        # non-None value from __init__ raises TypeError at runtime.
        self.processPage()

    def loadDriver(self):
        """Fetch the page with Selenium so JavaScript-rendered content is present."""
        print("Loading Drivers...")
        # Path to the chromedriver binary. NOTE(review): Selenium 4 prefers
        # a Service object over a positional path; kept for compatibility
        # with the pinned selenium version this script was written for.
        driver = webdriver.Chrome('./chromedriver')
        try:
            driver.get(self.url)
            # Crude fixed wait to let the page finish rendering.
            time.sleep(5)
            self.html = driver.page_source
        finally:
            # BUG FIX: always release the browser, even if the fetch raises;
            # the original leaked the driver on any exception.
            driver.close()
        print("Page fetching..")

    def processPage(self):
        """Parse the fetched HTML into a property -> values dict and save it as JSON."""
        time.sleep(2)
        print("Processing page..")
        # BUG FIX: the original re-downloaded the page with ``requests``,
        # discarding the JavaScript-rendered HTML Selenium just produced
        # (which was the whole point of using Selenium).
        soup = BeautifulSoup(self.html, "html.parser")
        title_res = soup.find(id="content")
        title = title_res.find_all("span", class_="wikibase-title-label")
        movie_name = title[0].text
        print("Movie :: " + movie_name)
        results = soup.find(id="mw-content-text")
        # Wikidata renders one statement group per property.
        prop_groups = results.find_all("div", class_="wikibase-statementgroupview")
        data = {}
        for prop in prop_groups:
            # The first surviving anchor is the property name; the rest
            # are its values. Maintenance-only anchors are dropped.
            entries = [
                a.text for a in prop.find_all("a")
                if self._SKIP_PATTERN.search(a.text) is None
            ]
            # BUG FIX: guard against a fully-filtered group — the original
            # ``pop(0)`` raised IndexError when every anchor was skipped.
            if entries:
                key = entries.pop(0)
                data[key] = entries
        print(json.dumps(data, ensure_ascii=False))
        out_name = movie_name.replace(' ', '_') + '.json'
        with open(out_name, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)
        print("Saved successfully in " + out_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment