Skip to content

Instantly share code, notes, and snippets.

@sergiolucero
Created March 30, 2019 13:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sergiolucero/304af786375e3ffd80df33171b7c9fe1 to your computer and use it in GitHub Desktop.
Save sergiolucero/304af786375e3ffd80df33171b7c9fe1 to your computer and use it in GitHub Desktop.
television scraper
import os, requests, time
import sqlite3, pandas as pd
from bs4 import BeautifulSoup
# Value of the STATIC_FOLDER environment variable (None when it is unset).
STATIC_FOLDER = os.getenv('STATIC_FOLDER')

# Channel slugs; each one is interpolated into the mi.tv listing URL.
# Every slug appears exactly once: a repeated entry would make the script
# scrape the same channel twice and store duplicate rows.
CANALES = [
    'rec-tv', 'canal-13-c', 'mega', 'canal-13', 'tvn', 'chilevision', 'ucv-tv',
    'tnt', 'la-red', 'fox', 'hbo', 'sony', 'axn', 'warner', 'a-e-mundo',
    'universal-channel', 'space', 'fx',
]
# Commented-out entry kept from the original list: 'tvn-24h'
def _tag_text(anchor, name, css_class=None):
    """Return the text of the first ``name`` tag found after ``anchor``.

    Returns an empty string when no such tag exists, so one incomplete
    listing entry does not abort the whole scrape with an AttributeError.
    """
    attrs = {'class': css_class} if css_class else {}
    tag = anchor.find_next(name, attrs=attrs)
    return tag.text if tag is not None else ''


def get_date(channel, date):
    """Scrape the programme listing of one channel for one day.

    Args:
        channel: channel slug as used in the mi.tv URL (e.g. ``'mega'``).
        date: day to fetch, formatted the way the URL expects
            (the caller in this file passes ``'YYYY-MM-DD'`` strings).

    Returns:
        A DataFrame with columns ``fecha``, ``hora``, ``programa`` and
        ``episodio``, one row per ``div.content`` element in the response.
        The frame is empty (but keeps its columns) when none are found.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    # NOTE(review): the trailing -180 looks like a UTC offset in minutes
    # (UTC-3) -- confirm against the site's API.
    url = f'https://mi.tv/cl/async/channel/{channel}/{date}/-180'
    # A timeout keeps a stalled connection from hanging the scrape forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'lxml')

    rows = [
        {
            'fecha': date,
            'hora': _tag_text(entry, 'span', 'time'),
            'programa': _tag_text(entry, 'h2').strip(),
            'episodio': _tag_text(entry, 'span', 'sub-title'),
        }
        for entry in soup.find_all('div', attrs={'class': 'content'})
    ]
    return pd.DataFrame(rows, columns=['fecha', 'hora', 'programa', 'episodio'])
########################
def channel_scraper(channel, dates):
    """Scrape one channel over several days and tag the rows with the channel.

    Args:
        channel: channel slug passed through to ``get_date``.
        dates: iterable of date strings, each passed through to ``get_date``.

    Returns:
        The per-day DataFrames concatenated in order, with an added
        ``canal`` column holding ``channel``. When ``dates`` is empty, an
        empty DataFrame with the same columns is returned.
    """
    frames = [get_date(channel, date) for date in dates]
    if frames:
        df = pd.concat(frames)
    else:
        # pd.concat raises ValueError on an empty sequence; return an empty
        # frame with the usual columns so callers can still concatenate it.
        df = pd.DataFrame(columns=['fecha', 'hora', 'programa', 'episodio'])
    df['canal'] = channel
    return df
#####
if __name__ == '__main__':
    # Script entry point: scrape every channel in CANALES for the months in
    # MESES of YEAR and store the result in the 'parrilla' table of
    # television.db, replacing any existing table.

    # Local imports: only the script entry point needs these.
    import calendar
    from contextlib import closing

    YEAR = 2018
    MESES = [10, 11, 12]
    t0 = time.time()
    frames = []
    total_rows = 0
    for c in CANALES:
        for m in MESES:
            # Only request days that exist in this month (no 2018-11-31).
            days_in_month = calendar.monthrange(YEAR, m)[1]
            dates = ['%d-%02d-%02d' % (YEAR, m, d)
                     for d in range(1, days_in_month + 1)]
            frame = channel_scraper(c, dates)
            frames.append(frame)
            total_rows += len(frame)
            # Progress report: channel, month, cumulative rows, rows/second.
            dt = time.time() - t0
            rate = round(total_rows / dt, 2) if dt else 0.0
            print(c, m, total_rows, rate)

    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and copying the accumulated frame on every iteration is quadratic.
    df = pd.concat(frames)
    # closing() ensures the connection is released even if the write fails.
    with closing(sqlite3.connect('television.db')) as conn:
        df.to_sql('parrilla', conn, if_exists='replace')
        conn.commit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment