# jairopinilla/imbdGetGenres.py

## imbdGetGenres.py
from imdb import IMDb
import re
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import requests
import time
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns

# Scrape IMDb genres for every title in ListaContenido and export them.
#
# For each row the script searches the title on imdb.com with Selenium,
# opens the first search result, reads the IMDb id from the resulting URL,
# asks IMDbPY for the movie's genres and finally writes all collected rows
# to "contenidoClasificado.xlsx".
#
# ListaContenido is not defined in this section; it must exist before this
# point and provide .iterrows() (presumably a pandas DataFrame -- TODO
# confirm). The title is read from the first column of each row.

# Imported here because the search term is URL-encoded below.
from urllib.parse import quote_plus

driver = webdriver.Chrome('chromedriver.exe')
ia = IMDb()

# Captures whatever sits between "title/tt" and the next "/" in a URL;
# compiled once instead of on every iteration.
_IMDB_ID_RE = re.compile(r'title/tt(.+?)/')

Contenidosclasificacion = []
for _indice, fila in ListaContenido.iterrows():
    # .iloc[0] is explicitly positional, so it does not depend on how the
    # columns are labelled.
    nombreRecursorigin = fila.iloc[0]

    # Reset per row: otherwise a failed lookup would silently reuse the id
    # of the previous title (or hit an undefined name on the first row).
    found = None

    try:
        # quote_plus turns spaces into '+' like the previous replace() did,
        # and also escapes characters such as '&' that would break the query.
        nombreRecurso = quote_plus(nombreRecursorigin)
        driver.get('https://www.imdb.com/find?s=tt&q=' + nombreRecurso + '&ref_=nv_sr_sm')

        # First link of the search-result table.
        button = driver.find_element(
            By.XPATH, '//*[@id="main"]/div/div[2]/table/tbody/tr[1]/td[2]/a')
        button.click()

        url = str(driver.current_url)
        m = _IMDB_ID_RE.search(url)
        if m is None:
            # Without an id there is nothing to ask IMDbPY for; fall through
            # to the "unclassified" branch instead of querying a stale id.
            raise LookupError('no IMDb title id found in URL: ' + url)
        found = m.group(1)

        movie = ia.get_movie(found)
        # Every genre is prefixed with ',' so the string starts with a comma
        # (e.g. ",Drama,Comedy"). Kept as-is because the exported sheet has
        # always had this format.
        generos = ''.join(',' + genre for genre in movie['genres'])
    except Exception as error:
        # Best effort: one failing title must not abort the whole run, but
        # the reason is reported instead of being swallowed. Ctrl-C is no
        # longer caught because only Exception subclasses are handled.
        generos = 'SIN CLASIFICAR'
        print('could not classify {!r}: {!r}'.format(nombreRecursorigin, error),
              file=sys.stderr)

    # NOTE(review): the key 'contenido:' carries a trailing colon, which ends
    # up as the column name in the spreadsheet; left untouched in case other
    # code reads that column -- confirm before renaming.
    Contenidosclasificacion.append(
        {'contenido:': nombreRecursorigin, 'generos': generos, 'idimdb': found})
    print(nombreRecursorigin, generos)

# One row per title: name, genre string and IMDb id (None when the id could
# not be determined).
dataClas = pd.DataFrame(Contenidosclasificacion)
dataClas.head()  # result is discarded here; only useful when run interactively
dataClas.to_excel("contenidoClasificado.xlsx")
	from imdb import IMDb
	import re
	from urllib.request import urlopen as uReq
	from bs4 import BeautifulSoup as soup
	from datetime import datetime, timedelta
	from selenium import webdriver
	from selenium.webdriver.common.by import By
	import time
	import requests
	import time
	import sys
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import statsmodels.api as sm
	import seaborn as sns

	# Scrape IMDb genres for every title in ListaContenido and export them.
	#
	# NOTE(review): this tab-indented section repeats the top-level script at
	# the start of the file. Its nested indentation had been lost (loop, try
	# and if bodies were all at the same level); it is restored here, while
	# the leading tab of the section is kept.
	#
	# For each row the script searches the title on imdb.com with Selenium,
	# opens the first search result, reads the IMDb id from the resulting URL,
	# asks IMDbPY for the movie's genres and finally writes all collected rows
	# to "contenidoClasificado.xlsx".
	#
	# ListaContenido is not defined in this section; it must exist before this
	# point and provide .iterrows() (presumably a pandas DataFrame -- TODO
	# confirm). The title is read from the first column of each row.

	# Imported here because the search term is URL-encoded below.
	from urllib.parse import quote_plus

	driver = webdriver.Chrome('chromedriver.exe')
	ia = IMDb()

	# Captures whatever sits between "title/tt" and the next "/" in a URL;
	# compiled once instead of on every iteration.
	_IMDB_ID_RE = re.compile(r'title/tt(.+?)/')

	Contenidosclasificacion = []
	for _indice, fila in ListaContenido.iterrows():
	    # .iloc[0] is explicitly positional, so it does not depend on how the
	    # columns are labelled.
	    nombreRecursorigin = fila.iloc[0]

	    # Reset per row: otherwise a failed lookup would silently reuse the id
	    # of the previous title (or hit an undefined name on the first row).
	    found = None

	    try:
	        # quote_plus turns spaces into '+' like the previous replace() did,
	        # and also escapes characters such as '&' that would break the query.
	        nombreRecurso = quote_plus(nombreRecursorigin)
	        driver.get('https://www.imdb.com/find?s=tt&q=' + nombreRecurso + '&ref_=nv_sr_sm')

	        # First link of the search-result table.
	        button = driver.find_element(
	            By.XPATH, '//*[@id="main"]/div/div[2]/table/tbody/tr[1]/td[2]/a')
	        button.click()

	        url = str(driver.current_url)
	        m = _IMDB_ID_RE.search(url)
	        if m is None:
	            # Without an id there is nothing to ask IMDbPY for; fall through
	            # to the "unclassified" branch instead of querying a stale id.
	            raise LookupError('no IMDb title id found in URL: ' + url)
	        found = m.group(1)

	        movie = ia.get_movie(found)
	        # Every genre is prefixed with ',' so the string starts with a comma
	        # (e.g. ",Drama,Comedy"). Kept as-is because the exported sheet has
	        # always had this format.
	        generos = ''.join(',' + genre for genre in movie['genres'])
	    except Exception as error:
	        # Best effort: one failing title must not abort the whole run, but
	        # the reason is reported instead of being swallowed. Ctrl-C is no
	        # longer caught because only Exception subclasses are handled.
	        generos = 'SIN CLASIFICAR'
	        print('could not classify {!r}: {!r}'.format(nombreRecursorigin, error),
	              file=sys.stderr)

	    # NOTE(review): the key 'contenido:' carries a trailing colon, which ends
	    # up as the column name in the spreadsheet; left untouched in case other
	    # code reads that column -- confirm before renaming.
	    Contenidosclasificacion.append(
	        {'contenido:': nombreRecursorigin, 'generos': generos, 'idimdb': found})
	    print(nombreRecursorigin, generos)

	# One row per title: name, genre string and IMDb id (None when the id could
	# not be determined).
	dataClas = pd.DataFrame(Contenidosclasificacion)
	dataClas.head()  # result is discarded here; only useful when run interactively
	dataClas.to_excel("contenidoClasificado.xlsx")