Last active
May 31, 2020 19:27
-
-
Save jairopinilla/994f274f0821a7fff059905ce63f1337 to your computer and use it in GitHub Desktop.
Code to get the IMDb genres of a movie from its title.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import sys
import time
from datetime import datetime, timedelta
from urllib.parse import quote_plus
from urllib.request import urlopen as uReq

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import statsmodels.api as sm
from bs4 import BeautifulSoup as soup
from imdb import IMDb
from selenium import webdriver
from selenium.webdriver.common.by import By
# Look up every title of ListaContenido on IMDb and export its genres.
#
# For each title the script opens IMDb's title search in Chrome, follows the
# first hit, reads the IMDb id from the resulting URL and asks IMDbPY for the
# movie's genres. The collected rows are written to contenidoClasificado.xlsx.
#
# NOTE(review): ListaContenido is not defined in this file. It is assumed to
# be a pandas DataFrame whose first column holds the movie titles - confirm
# against whatever loads it.

# XPath of the first hit in IMDb's title-search results table.
_FIRST_RESULT_XPATH = '//*[@id="main"]/div/div[2]/table/tbody/tr[1]/td[2]/a'
# Captures the digits of an IMDb title id from a URL containing "title/tt<digits>".
_IMDB_ID_PATTERN = re.compile(r'title/tt(\d+)')
# Genre label stored for titles that could not be resolved.
_UNCLASSIFIED = 'SIN CLASIFICAR'

# NOTE(review): recent Selenium 4 releases may no longer accept the driver
# path as a positional argument - confirm against the installed version.
driver = webdriver.Chrome('chromedriver.exe')
ia = IMDb()

Contenidosclasificacion = []
try:
    # Positional access to the first column; the titles live there.
    for nombreRecursorigin in ListaContenido.iloc[:, 0]:
        # Reset per title so a failure never reports the previous title's id
        # (or hits an undefined name on the very first title).
        found = None
        generos = _UNCLASSIFIED
        try:
            # quote_plus turns spaces into '+' and also escapes characters
            # such as '&' or '#' that would otherwise truncate the query.
            driver.get(
                'https://www.imdb.com/find?s=tt&q='
                + quote_plus(nombreRecursorigin)
                + '&ref_=nv_sr_sm'
            )
            driver.find_element(By.XPATH, _FIRST_RESULT_XPATH).click()

            m = _IMDB_ID_PATTERN.search(driver.current_url)
            if m:
                found = m.group(1)
                # Comma-separated genre names, without a leading separator.
                generos = ','.join(ia.get_movie(found)['genres'])
        except Exception as exc:
            # Best effort: any lookup failure (non-string title, no search
            # hit, network error, movie without genres) leaves the title
            # unclassified. Ctrl-C still interrupts the run.
            print(
                'lookup failed for {!r}: {!r}'.format(nombreRecursorigin, exc),
                file=sys.stderr,
            )

        # NOTE(review): the trailing colon in the 'contenido:' key looks like
        # a typo, but it is the exported column name, so it is kept as is.
        Contenidosclasificacion.append(
            {'contenido:': nombreRecursorigin, 'generos': generos, 'idimdb': found}
        )
        print(nombreRecursorigin, generos)
finally:
    # Always close the browser, even when the run is interrupted.
    driver.quit()

# dataClas: one row per title with its name, genres and IMDb id.
dataClas = pd.DataFrame(Contenidosclasificacion)
dataClas.head()
dataClas.to_excel("contenidoClasificado.xlsx")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
References: https://imdbpy.readthedocs.io/en/latest/