# sergiolucero/tv_scraper.py

## tv_scraper.py
import os, requests, time
import sqlite3, pandas as pd
from bs4 import BeautifulSoup

# Value of the STATIC_FOLDER environment variable (None when it is unset).
STATIC_FOLDER = os.getenv('STATIC_FOLDER')

# Channel slugs interpolated into the mi.tv URL, one scrape per slug.
# Every slug appears exactly once: a repeated entry (previously 'tnt') makes
# the main loop download and store that channel's rows twice.
CANALES = [
    'rec-tv', 'canal-13-c', 'mega', 'canal-13', 'tvn', 'chilevision', 'ucv-tv',
    'tnt', 'la-red', 'fox', 'hbo', 'sony', 'axn', 'warner', 'a-e-mundo',
    'universal-channel', 'space', 'fx',
]
# Disabled channel: 'tvn-24h'

def _next_text(node, tag, css_class=None):
    """Return the text of the first *tag* found after *node*, or '' if none."""
    attrs = {'class': css_class} if css_class else {}
    found = node.find_next(tag, attrs=attrs)
    # find_next returns None when nothing matches; avoid AttributeError on .text
    return found.text if found is not None else ''


def get_date(channel, date, timeout=30):
    """Scrape the mi.tv programme listing of one channel for one day.

    Args:
        channel: Channel slug inserted into the mi.tv URL (e.g. ``'tvn'``).
        date: Day to fetch, inserted verbatim into the URL; the script in
            this file passes ``'YYYY-MM-DD'`` strings.
        timeout: Seconds to wait for the HTTP response before
            ``requests`` raises a timeout error instead of blocking forever.

    Returns:
        A DataFrame with columns ``fecha``, ``hora``, ``programa`` and
        ``episodio``, one row per ``div.content`` element in the response
        (zero rows when the page has none).
    """
    # NOTE(review): the trailing -180 looks like a UTC offset in minutes — confirm.
    url = f'https://mi.tv/cl/async/channel/{channel}/{date}/-180'
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')
    contents = soup.find_all('div', attrs={'class': 'content'})

    episodes = [_next_text(c0, 'span', 'sub-title') for c0 in contents]
    times = [_next_text(c0, 'span', 'time') for c0 in contents]
    # Only the title is stripped, as before; time/sub-title text is kept raw.
    titulos = [_next_text(c0, 'h2').strip() for c0 in contents]

    df = pd.DataFrame(dict(hora=times, episodio=episodes, programa=titulos))
    df['fecha'] = date
    # Fix the column order used by the rest of the pipeline.
    return df[['fecha', 'hora', 'programa', 'episodio']]
########################
def channel_scraper(channel, dates):
    """Scrape several days of one channel and stack them into one DataFrame.

    Args:
        channel: Channel slug passed through to ``get_date``.
        dates: Iterable of date strings, each passed to ``get_date``.

    Returns:
        The per-day frames concatenated in order, with an extra ``canal``
        column holding *channel*. When *dates* is empty, an empty frame with
        the same five columns is returned.
    """
    frames = [get_date(channel, date) for date in dates]
    if not frames:
        # pd.concat raises ValueError on an empty list; return an empty,
        # correctly-shaped frame instead so callers can keep concatenating.
        return pd.DataFrame(
            columns=['fecha', 'hora', 'programa', 'episodio', 'canal'])
    df = pd.concat(frames)
    df['canal'] = channel
    return df

#####
if __name__ == '__main__':
    # Script entry point: scrape every channel in CANALES for Oct-Dec 2018
    # and store the result in table 'parrilla' of television.db.
    import calendar
    from contextlib import closing

    YEAR = 2018
    MESES = [10, 11, 12]
    t0 = time.time()
    frames = []   # one frame per (channel, month); concatenated once at the end
    n_rows = 0    # running row count for the progress line
    for c in CANALES:
        for m in MESES:
            # Only request days that exist in the month (e.g. no 2018-11-31).
            last_day = calendar.monthrange(YEAR, m)[1]
            dates = ['%d-%02d-%02d' % (YEAR, m, d)
                     for d in range(1, last_day + 1)]
            chunk = channel_scraper(c, dates)
            frames.append(chunk)
            n_rows += len(chunk)
            dt = time.time() - t0
            # channel, month, total rows so far, rows per second
            print(c, m, n_rows, round(n_rows / dt, 2))

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated rows on every iteration.
    df = pd.concat(frames) if frames else pd.DataFrame()
    with closing(sqlite3.connect('television.db')) as conn:
        df.to_sql('parrilla', conn, if_exists='replace')
        # Explicit commit before close, in case to_sql left a transaction open.
        conn.commit()
	import os, requests, time
	import sqlite3, pandas as pd
	from bs4 import BeautifulSoup

	# Value of the STATIC_FOLDER environment variable (None when it is unset).
	STATIC_FOLDER = os.getenv('STATIC_FOLDER')

	# Channel slugs interpolated into the mi.tv URL, one scrape per slug.
	# Every slug appears exactly once: a repeated entry (previously 'tnt')
	# makes the main loop download and store that channel's rows twice.
	CANALES = [
		'rec-tv', 'canal-13-c', 'mega', 'canal-13', 'tvn', 'chilevision', 'ucv-tv',
		'tnt', 'la-red', 'fox', 'hbo', 'sony', 'axn', 'warner', 'a-e-mundo',
		'universal-channel', 'space', 'fx',
	]
	# Disabled channel: 'tvn-24h'

	def _next_text(node, tag, css_class=None):
		"""Return the text of the first *tag* found after *node*, or '' if none."""
		attrs = {'class': css_class} if css_class else {}
		found = node.find_next(tag, attrs=attrs)
		# find_next returns None when nothing matches; avoid AttributeError on .text
		return found.text if found is not None else ''

	def get_date(channel, date, timeout=30):
		"""Scrape the mi.tv programme listing of one channel for one day.

		Args:
			channel: Channel slug inserted into the mi.tv URL (e.g. ``'tvn'``).
			date: Day to fetch, inserted verbatim into the URL; the script in
				this file passes ``'YYYY-MM-DD'`` strings.
			timeout: Seconds to wait for the HTTP response before
				``requests`` raises a timeout error instead of blocking forever.

		Returns:
			A DataFrame with columns ``fecha``, ``hora``, ``programa`` and
			``episodio``, one row per ``div.content`` element in the response
			(zero rows when the page has none).
		"""
		# NOTE(review): the trailing -180 looks like a UTC offset in minutes — confirm.
		url = f'https://mi.tv/cl/async/channel/{channel}/{date}/-180'
		response = requests.get(url, timeout=timeout)
		soup = BeautifulSoup(response.text, 'lxml')
		contents = soup.find_all('div', attrs={'class': 'content'})

		episodes = [_next_text(c0, 'span', 'sub-title') for c0 in contents]
		times = [_next_text(c0, 'span', 'time') for c0 in contents]
		# Only the title is stripped, as before; time/sub-title text is kept raw.
		titulos = [_next_text(c0, 'h2').strip() for c0 in contents]

		df = pd.DataFrame(dict(hora=times, episodio=episodes, programa=titulos))
		df['fecha'] = date
		# Fix the column order used by the rest of the pipeline.
		return df[['fecha', 'hora', 'programa', 'episodio']]
	########################
	def channel_scraper(channel, dates):
		"""Scrape several days of one channel and stack them into one DataFrame.

		Args:
			channel: Channel slug passed through to ``get_date``.
			dates: Iterable of date strings, each passed to ``get_date``.

		Returns:
			The per-day frames concatenated in order, with an extra ``canal``
			column holding *channel*. When *dates* is empty, an empty frame
			with the same five columns is returned.
		"""
		frames = [get_date(channel, date) for date in dates]
		if not frames:
			# pd.concat raises ValueError on an empty list; return an empty,
			# correctly-shaped frame instead so callers can keep concatenating.
			return pd.DataFrame(
				columns=['fecha', 'hora', 'programa', 'episodio', 'canal'])
		df = pd.concat(frames)
		df['canal'] = channel
		return df

	#####
	if __name__ == '__main__':
		# Script entry point: scrape every channel in CANALES for Oct-Dec 2018
		# and store the result in table 'parrilla' of television.db.
		import calendar
		from contextlib import closing

		YEAR = 2018
		MESES = [10, 11, 12]
		t0 = time.time()
		frames = []   # one frame per (channel, month); concatenated once at the end
		n_rows = 0    # running row count for the progress line
		for c in CANALES:
			for m in MESES:
				# Only request days that exist in the month (e.g. no 2018-11-31).
				last_day = calendar.monthrange(YEAR, m)[1]
				dates = ['%d-%02d-%02d' % (YEAR, m, d)
						 for d in range(1, last_day + 1)]
				chunk = channel_scraper(c, dates)
				frames.append(chunk)
				n_rows += len(chunk)
				dt = time.time() - t0
				# channel, month, total rows so far, rows per second
				print(c, m, n_rows, round(n_rows / dt, 2))

		# DataFrame.append was removed in pandas 2.0; a single concat also
		# avoids re-copying the accumulated rows on every iteration.
		df = pd.concat(frames) if frames else pd.DataFrame()
		with closing(sqlite3.connect('television.db')) as conn:
			df.to_sql('parrilla', conn, if_exists='replace')
			# Explicit commit before close, in case to_sql left a transaction open.
			conn.commit()