-
-
Save adeak/220f1994e5ed5fa23f599c74017b257e to your computer and use it in GitHub Desktop.
starscraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Jerry Coffin : 1 | |
Jason : 1 | |
shuttle87 : 1 | |
zounds : 1 | |
Phillip Cloud : 1 | |
BlackSheep : 1 | |
noumenal : 1 | |
Anaphory : 1 | |
enginefree : 1 | |
Brandon Lipman : 1 | |
Undo : 1 | |
Moinuddin Quadri : 1 | |
coltonoscopy : 1 | |
Danilo : 1 | |
ExoticBirdsMerchant : 1 | |
Dominico909 : 1 | |
Damon : 1 | |
Registered User : 1 | |
TemporalWolf : 1 | |
jjj : 1 | |
mkingsbu : 1 | |
Rainer Koirikivi : 1 | |
Abdullah UYU : 1 | |
CoderDude Twodee : 1 | |
Joe : 1 | |
MichaelHCameron : 1 | |
gsw945 : 1 | |
heather : 1 | |
user6845426 : 1 | |
Orange : 1 | |
TheExorcist : 1 | |
apronedsamurai : 1 | |
Pizza lord : 1 | |
RompePC : 1 | |
Keyur Potdar : 1 | |
Mr.Zeus : 1 | |
Acrimonious Mirth : 1 | |
Ignacio Vazquez-Abrams : 2 | |
Oliver : 2 | |
Mirko Cianfarani : 2 | |
Ajit : 2 | |
Joe : 2 | |
Stefan Pochmann : 2 | |
Brandin : 2 | |
user1743752 : 2 | |
Sohaib Asif : 2 | |
Sankar Raj : 2 | |
Jovito : 2 | |
Steven Vascellaro : 2 | |
user3504751 : 2 | |
Tomas Zubiri : 2 | |
Hexacoordinate-C : 2 | |
gloriousCatnip : 2 | |
Meet Taraviya : 2 | |
Tristen : 2 | |
Nathan777 : 2 | |
Tim Post : 3 | |
John Y : 3 | |
Xavier Combelle : 3 | |
Bibhas : 3 | |
Danack : 3 | |
Sudipta : 3 | |
HamZa : 3 | |
lciamp : 3 | |
ddelemeny : 3 | |
Noob Saibot : 3 | |
Bonifacio2 : 3 | |
QPaysTaxes : 3 | |
Mirac7 : 3 | |
isedev : 3 | |
TigerhawkT3 : 3 | |
ChillarAnand : 3 | |
Christian Ternus : 3 | |
Iplodman : 3 | |
jQuery Angry Bird : 3 | |
Sab : 3 | |
Augusta : 3 | |
AutomaticStatic : 3 | |
MYGz : 3 | |
Seraphim : 3 | |
Ananthu : 3 | |
Mark R. : 3 | |
Cam_Aust : 3 | |
faceless : 3 | |
Terry : 3 | |
CoderCat : 3 | |
Dan Lugg : 4 | |
wonderb0lt : 4 | |
tripleee : 4 | |
marxin : 4 | |
Angelo Tricarico : 4 | |
Mahesha999 : 4 | |
Jerry : 4 | |
Karin : 4 | |
Jivan : 4 | |
Vader : 4 | |
David Cullen : 4 | |
matsjoyce : 4 | |
Jfach : 4 | |
SuperBiasedMan : 4 | |
Dracunos : 4 | |
Anarach : 4 | |
Reno : 5 | |
Kneel-Before-ZOD : 5 | |
Hyperboreus : 5 | |
Reut Sharabani : 5 | |
gerrit : 5 | |
copy : 5 | |
Ahmad : 5 | |
Abhishek : 5 | |
Zacrath : 5 | |
Abhishek Bhatia : 5 | |
rp372 : 5 | |
iShaymus : 5 | |
zondo : 5 | |
Victoria : 5 | |
Simeon Aleksov : 5 | |
solarc : 6 | |
kush : 6 | |
Roman Luštrik : 6 | |
ypercube : 6 | |
Skyler : 6 | |
Volatility : 6 | |
Aruka J : 6 | |
K DawG : 6 | |
Matarata : 6 | |
Olivier Melançon : 6 | |
Sebastian Nielsen : 6 | |
A. Smoliak : 6 | |
BoltClock : 7 | |
rlemon : 7 | |
10 Replies : 7 | |
Bonstark : 7 | |
Daniel Roseman : 8 | |
PeeHaa : 8 | |
dead beef : 8 | |
Alex Thornton : 8 | |
aIKid : 8 | |
James Dean : 8 | |
Cody Piersall : 9 | |
Terfin : 9 | |
JVarhol : 9 | |
Reblochon Masque : 9 | |
kale : 9 | |
abhi : 10 | |
Badger Cat : 10 | |
iCodez : 10 | |
Jacque Goupil : 10 | |
Awal Garg : 10 | |
Patrick Maupin : 10 | |
TheSoundDefense : 10 | |
Tshepang : 11 | |
Florian Margaine : 11 | |
Daи : 11 | |
Haidro : 11 | |
Praveen Rawat : 11 | |
user4433485 : 11 | |
paul23 : 12 | |
Paolo Casciello : 12 | |
Rohit Barnwal : 12 | |
Wooble : 13 | |
alecxe : 13 | |
Zack Tarr : 13 | |
OneRaynyDay : 13 | |
Alexander Huszagh : 13 | |
Flexo : 14 | |
excaza : 14 | |
Carpetsmoker : 15 | |
Unihedron : 15 | |
tilaprimera : 16 | |
Programmer : 16 | |
Joran Beasley : 17 | |
Sterling Archer : 17 | |
Byte Commander : 17 | |
mgilson : 18 | |
roganjosh : 18 | |
Gemtastic : 19 | |
danidee : 20 | |
Simon : 22 | |
tzaman : 23 | |
Sword : 23 | |
Benjamin Gruenbaum : 24 | |
Arne : 25 | |
Aaron Hall : 26 | |
roippi : 26 | |
MattDMo : 28 | |
Andy K : 30 | |
Ilja Everilä : 31 | |
JGrindal : 33 | |
randomhopeful : 33 | |
Lev Levitsky : 36 | |
inspectorG4dget : 38 | |
Games Brainiac : 38 | |
Withnail : 39 | |
enderland : 40 | |
Adam Smith : 41 | |
ThiefMaster : 43 | |
toonarmycaptain : 44 | |
PaulMcG : 47 | |
user2357112 : 50 | |
Wayne Conrad : 63 | |
Feeds : 64 | |
JGreenwell : 71 | |
Ashish Nitin Patil : 75 | |
khajvah : 77 | |
bereal : 83 | |
jonrsharpe : 84 | |
Code-Apprentice : 86 | |
Air : 90 | |
cᴏʟᴅsᴘᴇᴇᴅ : 95 | |
thefourtheye : 98 | |
abarnert : 101 | |
Marcus Andrews : 104 | |
vaultah : 105 | |
piRSquared : 106 | |
Inbar Rose : 107 | |
MooingRawr : 113 | |
QuestionC : 149 | |
holdenweb : 150 | |
OldTinfoil : 152 | |
J Richard Snape : 156 | |
Bhargav Rao : 182 | |
Wayne Werner : 247 | |
Peter Varo : 254 | |
Zero Piraeus : 282 | |
Morgan Thrapp : 295 | |
corvid : 300 | |
Aran-Fey : 321 | |
Kevin M Granger : 430 | |
idjaw : 439 | |
Martijn Pieters : 468 | |
wim : 517 | |
poke : 533 | |
Robert Grant : 537 | |
Antti Haapala : 589 | |
Jon Clements : 626 | |
PM 2Ring : 630 | |
DSM : 759 | |
Andras Deak : 944 | |
davidism : 1029 | |
Ffisegydd : 1251 | |
user559633 : 1397 | |
Kevin : 3870 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup as BS | |
from datetime import datetime,timedelta | |
from operator import itemgetter | |
import json | |
# ID of the chat room whose starboard is scraped when run as a script.
roomID = 6
def get_pagecount(roomID):
    """Return the number of pages in the paginated starboard of a room.

    An absurdly high page number is requested on purpose; the number shown
    in the ``page-numbers current`` span of the response is used as the
    page count. When the response has no such span, 1 is returned.
    """
    url = f'https://chat.stackoverflow.com/rooms/info/{roomID}?tab=stars&page=999999'  # sorry
    response = requests.get(url)
    parsed = BS(response.text, 'html.parser')
    current_marker = parsed.find('span', 'page-numbers current')
    if not current_marker:
        # no pagination widget found: treat the starboard as a single page
        return 1
    return int(current_marker.text)
def normalize_timestamp(timeinfo, now=None):
    """Parse a flexible SO chat timestamp and return a naive datetime.

    The following styles are tried, in this order:

    * ``"1:04 PM"``             -- a time on the reference day
    * ``"yst 10:23 PM"``        -- the day before the reference day
    * ``"Mon 5:33 PM"``         -- the most recent such weekday strictly
      before the reference day (1 to 7 days back)
    * ``"Apr 19 4:20 PM"``      -- that day in the reference year
    * ``"Nov 10 '16 12:36 AM"`` -- a full timestamp

    Day and month names are matched using the current locale (``%a``,
    ``%b``).

    Parameters:
        timeinfo: the timestamp text to parse.
        now: optional ``datetime`` used as the reference moment for the
            relative styles; defaults to ``datetime.today()``. Only its
            calendar date is used.

    Returns:
        A naive ``datetime`` with the resolved date and the parsed time.

    Raises:
        ValueError: if ``timeinfo`` matches none of the styles above.
    """
    clock_format = '%I:%M %p'
    # take a single snapshot of "today" so every branch agrees on the date,
    # even if the call happens to straddle midnight
    if now is None:
        now = datetime.today()
    today = now.date()

    # first try to parse last 24 hours: "1:04 PM" style
    try:
        time = datetime.strptime(timeinfo, clock_format).time()
        return datetime.combine(today, time)
    except ValueError:
        pass

    # now try yesterday: "yst 10:23 PM" style
    if timeinfo.startswith('yst'):
        time = datetime.strptime(timeinfo[4:], clock_format).time()
        return datetime.combine(today - timedelta(days=1), time)

    # now try last week: "Mon 5:33 PM" style
    try:
        # 1/1/1 was a Monday, so this list is indexed like date.weekday()
        names_of_days = [(datetime(1, 1, 1) + timedelta(days=k)).strftime('%a') for k in range(7)]
        day, rest = timeinfo.split(' ', maxsplit=1)
        weekday_then = names_of_days.index(day)
        time = datetime.strptime(rest, clock_format).time()
        # a weekday-prefixed stamp never denotes today (that would be a bare
        # time), so a matching weekday means a full week ago, not zero days
        shift = (today.weekday() - weekday_then) % 7 or 7
        return datetime.combine(today - timedelta(days=shift), time)
    except ValueError:
        pass

    # now try this year: "Apr 19 4:20 PM" style
    # The year is supplied explicitly: a yearless strptime defaults to 1900,
    # which is not a leap year, so "Feb 29 ..." could never be parsed.
    try:
        return datetime.strptime(f'{today.year:04d} {timeinfo}', '%Y %b %d %I:%M %p')
    except ValueError:
        pass

    # now all that should be left is a full timestamp: "Nov 10 '16 12:36 AM" style
    try:
        return datetime.strptime(timeinfo, "%b %d '%y %I:%M %p")
    except ValueError:
        pass

    raise ValueError(f'Unable to parse flexible date: {timeinfo}')
def _star_message(entry):
    """Return the linked URL for a oneboxed message, or the message text."""
    onebox = entry.find('div', 'onebox')
    if not onebox:
        return entry.find('div', 'content').text.strip()
    # TODO: some onebox types may be broken here; only twitter and youtube and images tested
    links = onebox.find_all('a', href=True)
    if not links:
        # onebox without any usable link: keep its text instead of crashing
        return onebox.text.strip()
    msg = links[-1]['href']
    if msg.startswith('//'):
        # internal link, missing protocol
        msg = f'https:{msg}'
    return msg


def _star_author(entry):
    """Return ``(userID, username)`` for the author of a starboard entry."""
    userinfo = entry.find('div', 'username')
    username = userinfo.text
    if userinfo.a:
        # user exists: the ID is the third '/'-separated piece of the
        # profile href (presumably "/users/<id>/<name>")
        userID = int(userinfo.a['href'].split('/')[2])
    else:
        # user has been deleted: the ID is whatever follows the first four
        # characters of the displayed name (presumably "user<id>")
        userID = int(username[4:])
    return userID, username


def parse_page(roomID, page):
    """Parse a single page of the starboard.

    Downloads page ``page`` of the stars tab of room ``roomID`` and turns
    every ``div.monologue`` element into a dict.

    Returns:
        A list of dicts with the keys ``msgID``, ``timestamp`` (POSIX
        timestamp as a float, because a raw datetime is not JSON
        serializable), ``readable date``, ``userID``, ``username``,
        ``msgURL``, ``starcount`` and ``msg``. ``msg`` is the last link of
        a oneboxed message, or the stripped message text otherwise.

    Raises:
        ValueError: if a timestamp cannot be parsed by
            ``normalize_timestamp``, or an ID is not numeric.
    """
    SOroot = 'https://chat.stackoverflow.com'
    dat = requests.get(f'{SOroot}/rooms/info/{roomID}?tab=stars&page={page}')
    soup = BS(dat.text, 'html.parser')

    # loop over stars
    starinfo = []
    for entry in soup.find_all('div', 'monologue'):
        # an empty star counter is treated as a single star
        starcount = int(entry.find('span', 'times').text or '1')
        msgURL = SOroot + entry.find('div', 'message').a['href']
        # the message ID is the URL fragment after '#'
        msgID = int(msgURL.rpartition('#')[-1])

        # try extracting a link for oneboxen, leave text for the rest
        msg = _star_message(entry)
        userID, username = _star_author(entry)
        timestamp = normalize_timestamp(entry.find('div', 'timestamp').text)

        starinfo.append({
            'msgID': msgID,
            'timestamp': timestamp.timestamp(),  # raw datetime is not json serializable :(
            'readable date': timestamp.strftime('%Y %b %d %H:%M'),
            'userID': userID,
            'username': username,
            'msgURL': msgURL,
            'starcount': starcount,
            'msg': msg,
        })
    return starinfo
if __name__ == "__main__":
    # Scrape every starboard page of the configured room, order the stars
    # by descending timestamp, and dump them as JSON to star_data.out.
    pages = get_pagecount(roomID)
    collected = []
    for page in range(1, pages + 1):
        collected += parse_page(roomID, page)
        print(f'Done with page {page}/{pages}')

    newest_first = sorted(collected, key=itemgetter('timestamp'), reverse=True)
    with open('star_data.out', 'w') as outf:
        json.dump(newest_first, outf)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment