Skip to content

Instantly share code, notes, and snippets.

@elazarg
Last active August 26, 2020 22:32
Show Gist options
  • Save elazarg/87566015c436c43f3a683725b89e5371 to your computer and use it in GitHub Desktop.
Save elazarg/87566015c436c43f3a683725b89e5371 to your computer and use it in GitHub Desktop.
import csv

import requests
from bs4 import BeautifulSoup
from isoweek import Week
def fetch_table(year, week, timeout=30):
    """Download the weekly ratings page for one ISO week.

    The week is sent to the server as the date of that ISO week's Sunday,
    formatted ``dd/mm/YYYY``, in a form-encoded POST request.

    Args:
        year: ISO year of the requested week.
        week: ISO week number within ``year``.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would
            block the caller forever.

    Returns:
        The raw response body (``response.content``).
    """
    date = Week(year, week).sunday().strftime("%d/%m/%Y")
    url = 'http://212.150.52.211/scripts/rating10.asp'
    form = {
        'audience': '1',
        'week': date,
        'date_selector': 'weekly',
    }
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    response = requests.post(url, data=form, headers=headers, timeout=timeout)
    return response.content
# The 22 non-final Hebrew letters; a token containing any of them is flipped.
_HEBREW_LETTERS = frozenset('אבגדהוזחטיכלמנסעפצקרשת')


def reverse_hebrew(s):
    """Reverse the word order of ``s`` and un-flip its Hebrew tokens.

    ``s`` is split on whitespace. Every token that contains at least one of
    the letters in ``_HEBREW_LETTERS`` is reversed character by character;
    all other tokens are kept as they are. The tokens are then emitted in
    reverse order, joined by single spaces.
    """
    flipped = []
    for token in s.split():
        if not _HEBREW_LETTERS.isdisjoint(token):
            # Flipping the token also flips any digits inside it; this one
            # known case is put back (presumably "the 80s" -- confirm).
            token = token[::-1].replace('ה-08', 'ה-80')
        flipped.append(token)
    flipped.reverse()
    return ' '.join(flipped)
def parse(html):
    """Yield one tuple per ranked row found in the ratings page ``html``.

    Only ``<tr>`` elements with exactly nine ``<td>`` cells are considered.
    Each cell's text has its newlines removed, is stripped, and is passed
    through ``reverse_hebrew``. Rows whose last cell (the rank) is not made
    of digits are skipped.

    Yields:
        ``(rank, name, channel, date, day, duration, rating, thousands,
        percent)`` -- the cells in the opposite order to the page.
    """
    soup = BeautifulSoup(html, features="lxml")
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) == 9:
            texts = []
            for cell in cells:
                raw = cell.get_text().replace('\n', '').strip()
                texts.append(reverse_hebrew(raw))
            # Page order is: percent, thousands, rating, duration, day,
            # date, channel, name, rank -- so the rank is the last cell.
            if texts[-1].isdigit():
                yield tuple(reversed(texts))
def main():
    """Scrape every weekly ratings table and write the rows to ``rating.csv``.

    Each output row is ``year, week`` followed by the fields yielded by
    ``parse`` with the leading rank dropped.
    """
    first_year = 2008
    last_year = 2020
    # The final year is only covered up to this ISO week.
    last_week_of_last_year = 34

    # newline='' lets the csv writer control line endings itself.
    with open('rating.csv', 'w', encoding='utf8', newline='') as f:
        # csv.writer quotes fields that contain commas or quotes, so such a
        # field cannot split into extra columns. '\n' keeps the line
        # terminator that print() used.
        writer = csv.writer(f, lineterminator='\n')
        # last_year is included: the partial-year branch below was
        # unreachable when the range stopped before it.
        for year in range(first_year, last_year + 1):
            if year < last_year:
                # 52 or 53, depending on the year.
                numweeks = Week.last_week_of_year(year).week
            else:
                numweeks = last_week_of_last_year
            # ISO weeks are numbered from 1. Week(year, 0) is normalized by
            # isoweek to the last week of the previous year, which would
            # label rows with the wrong year and week.
            for week in range(1, numweeks + 1):
                print(year, week, end='\r', flush=True)
                html = fetch_table(year, week)
                for rank, *row in parse(html):
                    writer.writerow([year, week, *row])


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment