@adeak
Last active April 26, 2018 19:12
starscraper
Jerry Coffin : 1
Jason : 1
shuttle87 : 1
zounds : 1
Phillip Cloud : 1
BlackSheep : 1
noumenal : 1
Anaphory : 1
enginefree : 1
Brandon Lipman : 1
Undo : 1
Moinuddin Quadri : 1
coltonoscopy : 1
Danilo : 1
ExoticBirdsMerchant : 1
Dominico909 : 1
Damon : 1
Registered User : 1
TemporalWolf : 1
jjj : 1
mkingsbu : 1
Rainer Koirikivi : 1
Abdullah UYU : 1
CoderDude Twodee : 1
Joe : 1
MichaelHCameron : 1
gsw945 : 1
heather : 1
user6845426 : 1
Orange : 1
TheExorcist : 1
apronedsamurai : 1
Pizza lord : 1
RompePC : 1
Keyur Potdar : 1
Mr.Zeus : 1
Acrimonious Mirth : 1
Ignacio Vazquez-Abrams : 2
Oliver : 2
Mirko Cianfarani : 2
Ajit : 2
Joe : 2
Stefan Pochmann : 2
Brandin : 2
user1743752 : 2
Sohaib Asif : 2
Sankar Raj : 2
Jovito : 2
Steven Vascellaro : 2
user3504751 : 2
Tomas Zubiri : 2
Hexacoordinate-C : 2
gloriousCatnip : 2
Meet Taraviya : 2
Tristen : 2
Nathan777 : 2
Tim Post : 3
John Y : 3
Xavier Combelle : 3
Bibhas : 3
Danack : 3
Sudipta : 3
HamZa : 3
lciamp : 3
ddelemeny : 3
Noob Saibot : 3
Bonifacio2 : 3
QPaysTaxes : 3
Mirac7 : 3
isedev : 3
TigerhawkT3 : 3
ChillarAnand : 3
Christian Ternus : 3
Iplodman : 3
jQuery Angry Bird : 3
Sab : 3
Augusta : 3
AutomaticStatic : 3
MYGz : 3
Seraphim : 3
Ananthu : 3
Mark R. : 3
Cam_Aust : 3
faceless : 3
Terry : 3
CoderCat : 3
Dan Lugg : 4
wonderb0lt : 4
tripleee : 4
marxin : 4
Angelo Tricarico : 4
Mahesha999 : 4
Jerry : 4
Karin : 4
Jivan : 4
Vader : 4
David Cullen : 4
matsjoyce : 4
Jfach : 4
SuperBiasedMan : 4
Dracunos : 4
Anarach : 4
Reno : 5
Kneel-Before-ZOD : 5
Hyperboreus : 5
Reut Sharabani : 5
gerrit : 5
copy : 5
Ahmad : 5
Abhishek : 5
Zacrath : 5
Abhishek Bhatia : 5
rp372 : 5
iShaymus : 5
zondo : 5
Victoria : 5
Simeon Aleksov : 5
solarc : 6
kush : 6
Roman Luštrik : 6
ypercube : 6
Skyler : 6
Volatility : 6
Aruka J : 6
K DawG : 6
Matarata : 6
Olivier Melançon : 6
Sebastian Nielsen : 6
A. Smoliak : 6
BoltClock : 7
rlemon : 7
10 Replies : 7
Bonstark : 7
Daniel Roseman : 8
PeeHaa : 8
dead beef : 8
Alex Thornton : 8
aIKid : 8
James Dean : 8
Cody Piersall : 9
Terfin : 9
JVarhol : 9
Reblochon Masque : 9
kale : 9
abhi : 10
Badger Cat : 10
iCodez : 10
Jacque Goupil : 10
Awal Garg : 10
Patrick Maupin : 10
TheSoundDefense : 10
Tshepang : 11
Florian Margaine : 11
Daи : 11
Haidro : 11
Praveen Rawat : 11
user4433485 : 11
paul23 : 12
Paolo Casciello : 12
Rohit Barnwal : 12
Wooble : 13
alecxe : 13
Zack Tarr : 13
OneRaynyDay : 13
Alexander Huszagh : 13
Flexo : 14
excaza : 14
Carpetsmoker : 15
Unihedron : 15
tilaprimera : 16
Programmer : 16
Joran Beasley : 17
Sterling Archer : 17
Byte Commander : 17
mgilson : 18
roganjosh : 18
Gemtastic : 19
danidee : 20
Simon : 22
tzaman : 23
Sword : 23
Benjamin Gruenbaum : 24
Arne : 25
Aaron Hall : 26
roippi : 26
MattDMo : 28
Andy K : 30
Ilja Everilä : 31
JGrindal : 33
randomhopeful : 33
Lev Levitsky : 36
inspectorG4dget : 38
Games Brainiac : 38
Withnail : 39
enderland : 40
Adam Smith : 41
ThiefMaster : 43
toonarmycaptain : 44
PaulMcG : 47
user2357112 : 50
Wayne Conrad : 63
Feeds : 64
JGreenwell : 71
Ashish Nitin Patil : 75
khajvah : 77
bereal : 83
jonrsharpe : 84
Code-Apprentice : 86
Air : 90
cᴏʟᴅsᴘᴇᴇᴅ : 95
thefourtheye : 98
abarnert : 101
Marcus Andrews : 104
vaultah : 105
piRSquared : 106
Inbar Rose : 107
MooingRawr : 113
QuestionC : 149
holdenweb : 150
OldTinfoil : 152
J Richard Snape : 156
Bhargav Rao : 182
Wayne Werner : 247
Peter Varo : 254
Zero Piraeus : 282
Morgan Thrapp : 295
corvid : 300
Aran-Fey : 321
Kevin M Granger : 430
idjaw : 439
Martijn Pieters : 468
wim : 517
poke : 533
Robert Grant : 537
Antti Haapala : 589
Jon Clements : 626
PM 2Ring : 630
DSM : 759
Andras Deak : 944
davidism : 1029
Ffisegydd : 1251
user559633 : 1397
Kevin : 3870
import requests
from bs4 import BeautifulSoup as BS
from datetime import datetime, timedelta
from operator import itemgetter
import json
roomID = 6
def get_pagecount(roomID):
    """Get number of pages in paginated starboard"""
    dat = requests.get(f'https://chat.stackoverflow.com/rooms/info/{roomID}?tab=stars&page=999999')  # sorry
    content = dat.text
    soup = BS(content, 'html.parser')
    pageinfo = soup.find('span', 'page-numbers current')
    numpages = 1 if not pageinfo else int(pageinfo.text)
    return numpages
def normalize_timestamp(timeinfo):
    """Parse flexible SO timestamps and return appropriate datetime object"""
    # first try to parse last 24 hours: "1:04 PM" style
    try:
        time = datetime.strptime(timeinfo, '%I:%M %p').time()
        date = datetime.today()
        return datetime.combine(date, time)
    except ValueError:
        pass
    # now try yesterday: "yst 10:23 PM" style
    if timeinfo.startswith('yst'):
        time = datetime.strptime(timeinfo[4:], '%I:%M %p').time()
        date = datetime.today() - timedelta(days=1)
        return datetime.combine(date, time)
    # now try last week: "Mon 5:33 PM" style
    try:
        names_of_days = [datetime.strftime(datetime(1, 1, 1) + timedelta(days=k), '%a') for k in range(7)]  # 1/1/1 was a Monday https://github.com/python/cpython/blob/bac2d5ba30339298db7d4caa9c8cd31d807cf081/Modules/_datetimemodule.c#L355
        day, rest = timeinfo.split(' ', maxsplit=1)
        weekday_then = names_of_days.index(day)
        weekday_today = datetime.today().weekday()
        shift = (weekday_today - weekday_then) % 7
        time = datetime.strptime(rest, '%I:%M %p').time()
        date = datetime.today() - timedelta(days=shift)
        return datetime.combine(date, time)
    except ValueError:
        pass
    # now try earlier this year: "Apr 19 4:20 PM" style
    try:
        dt = datetime.strptime(timeinfo, '%b %d %I:%M %p')
        today = datetime.today()
        #yearshift = 1 if dt.month > today.month else 0  # not needed because dates from last year have explicit dates
        # there should be a better way to replace the year...
        time = dt.time()
        date = datetime(today.year, dt.month, dt.day)
        return datetime.combine(date, time)
    except ValueError:
        pass
    # now all that should be left is a full timestamp: "Nov 10 '16 12:36 AM" style
    try:
        return datetime.strptime(timeinfo, "%b %d '%y %I:%M %p")
    except ValueError:
        pass
    raise ValueError(f'Unable to parse flexible date: {timeinfo}')
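# Illustrative examples of the formats handled above (for reference only; the
# resulting dates depend on the day the script is run):
#   normalize_timestamp('1:04 PM')              -> today at 13:04
#   normalize_timestamp('yst 10:23 PM')         -> yesterday at 22:23
#   normalize_timestamp('Mon 5:33 PM')          -> the most recent Monday at 17:33
#   normalize_timestamp('Apr 19 4:20 PM')       -> April 19 of the current year at 16:20
#   normalize_timestamp("Nov 10 '16 12:36 AM")  -> 2016-11-10 00:36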
def parse_page(roomID, page):
    """Parse a single page of the starboard"""
    SOroot = 'https://chat.stackoverflow.com'
    dat = requests.get(f'https://chat.stackoverflow.com/rooms/info/{roomID}?tab=stars&page={page}')
    content = dat.text
    soup = BS(content, 'html.parser')
    # loop over stars
    starinfo = []
    for entry in soup.find_all('div', 'monologue'):
        starcount = int(entry.find('span', 'times').text or '1')
        msgURL = SOroot + entry.find('div', 'message').a['href']
        msgID = int(msgURL.rpartition('#')[-1])
        # try extracting a link for oneboxen, leave text for the rest
        onebox = entry.find('div', 'onebox')
        if onebox:
            # TODO: some onebox types may be broken here; only twitter and youtube and images tested
            msg = onebox.find_all('a')[-1]['href']
            if msg.startswith('//'):
                # internal link, missing protocol
                msg = f'https:{msg}'
        else:
            msg = entry.find('div', 'content').text.strip()
        userinfo = entry.find('div', 'username')
        username = userinfo.text
        if userinfo.a:
            # user exists
            userURL = SOroot + userinfo.a['href']
            userID = int(userinfo.a['href'].split('/')[2])
        else:
            # user has been deleted
            userID = int(username[4:])
            userURL = f'{SOroot}/users/{userID}'  # not that it exists, just for show
        timeinfo = entry.find('div', 'timestamp').text
        timestamp = normalize_timestamp(timeinfo)
        starinfo.append({
            'msgID': msgID,
            'timestamp': timestamp.timestamp(),  # raw datetime is not json serializable :(
            'readable date': datetime.strftime(timestamp, '%Y %b %d %H:%M'),
            'userID': userID,
            'username': username,
            'msgURL': msgURL,
            'starcount': starcount,
            'msg': msg,
        })
    return starinfo
if __name__ == "__main__":
    pages = get_pagecount(roomID)
    starinfo = []
    for page in range(1, pages + 1):
        starinfo.extend(parse_page(roomID, page))
        print(f'Done with page {page}/{pages}')
    starinfo.sort(key=itemgetter('timestamp'), reverse=True)
    with open('star_data.out', 'w') as outf:
        json.dump(starinfo, outf)
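After a run, star_data.out holds a JSON list of the per-message records built in parse_page. Below is a minimal sketch of how a per-user tally like the listing at the top of this gist could be reproduced from that file; the file name and the 'username'/'starcount' keys come from the script above, while the aggregation (summing star counts per user, sorted ascending) is just one plausible reading of the data.

import json
from collections import Counter

with open('star_data.out') as inf:
    starinfo = json.load(inf)

# sum the stars received per user (assumed to match the tally format above)
tally = Counter()
for star in starinfo:
    tally[star['username']] += star['starcount']

# print "username : count", smallest totals first, like the listing at the top
for username, count in sorted(tally.items(), key=lambda item: item[1]):
    print(f'{username} : {count}')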