Skip to content

Instantly share code, notes, and snippets.

@yc0
Created October 5, 2018 09:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yc0/bac8928e22e51a37c2594dbffe458046 to your computer and use it in GitHub Desktop.
Save yc0/bac8928e22e51a37c2594dbffe458046 to your computer and use it in GitHub Desktop.
Crawl information about OpenStack Summit videos
from datetime import datetime
from lxml import etree
import requests as req
import csv
import re
def handle_slide(slide):
    """Return the absolute URL of a slide deck hosted on openstack.org.

    Args:
        slide: Value of a video's ``slides`` field; may be ``None``.

    Returns:
        ``'https://www.openstack.org'`` followed by *slide* when *slide* is a
        string starting with ``/asset``; otherwise an empty string.
    """
    # A plain prefix test needs no regex, and the isinstance guard keeps a
    # non-string value from raising instead of yielding "".
    if isinstance(slide, str) and slide.startswith('/asset'):
        return 'https://www.openstack.org' + slide
    return ""
def handle_video(youtube_id):
    """Build the YouTube embed URL for *youtube_id*.

    Returns an empty string when *youtube_id* is falsy (``None`` or ``""``).
    """
    if not youtube_id:
        return ""
    embed_prefix = "https://www.youtube.com/embed/"
    return embed_prefix + youtube_id
def handle_speakers(speakers):
    """Join the ``name`` of every speaker into one comma-separated string.

    Args:
        speakers: Iterable of dict-like speaker records, or ``None``.

    Returns:
        The names joined with ``,``. Records without a (non-empty) ``name``
        are skipped, and ``None``/empty input yields an empty string.
    """
    # `or ()` tolerates a missing speaker list; filtering out falsy names
    # keeps str.join from raising TypeError on a None entry.
    names = (entry.get('name') for entry in speakers or ())
    return ",".join(name for name in names if name)
def handle_tags(tags):
    """Join the ``tag`` of every tag record into one comma-separated string.

    Args:
        tags: Iterable of dict-like tag records, or ``None``.

    Returns:
        The tag values joined with ``,``. Records without a (non-empty)
        ``tag`` are skipped, and ``None``/empty input yields an empty string.
    """
    # `or ()` tolerates a missing tag list; filtering out falsy values keeps
    # str.join from raising TypeError on a None entry.
    labels = (entry.get('tag') for entry in tags or ())
    return ",".join(label for label in labels if label)
_API_ROOT = 'https://www.openstack.org/videos/api'
# The original script advanced the `start` offset by 50 per page.
# NOTE(review): presumably the API's page size — confirm against the service.
_PAGE_SIZE = 50
_REQUEST_TIMEOUT = 30  # seconds; keeps a stalled connection from hanging the crawl
_CSV_HEADER = ['Title', 'Date', '#Views', 'YoutubeID', 'Slide', 'Tracks',
               'Speakers', 'Tags', 'Video', 'Description']


def _fetch_json(url, params=None):
    """GET *url* and return its decoded JSON body, raising on HTTP errors."""
    response = req.get(url, params=params, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()


def _extract_description(html):
    """Flatten a video's HTML description into a single text value.

    Text is collected from ``<p>`` elements and from ``<span>`` elements
    nested in them. A lone fragment is returned untouched; several fragments
    are stripped, empty ones dropped, and the rest concatenated with a
    ``[n]`` prefix each. Missing or blank HTML yields an empty string.
    """
    if not html or not html.strip():
        return ""
    tree = etree.HTML(html)
    # NOTE(review): etree.HTML is assumed to possibly return None when the
    # input holds no parseable markup — confirm for the installed lxml.
    if tree is None:
        return ""
    fragments = tree.xpath('//p/text()') + tree.xpath('//p//span/text()')
    if len(fragments) == 1:
        return fragments[0]
    # str.strip() with no argument already removes all surrounding
    # whitespace, so no further space-stripping passes are needed. Pipes are
    # left alone: csv.writer quotes any field containing the delimiter.
    non_empty = [text for text in (frag.strip() for frag in fragments) if text]
    return "".join(
        "[{}]{}".format(number, text)
        for number, text in enumerate(non_empty, start=1)
    )


def _build_row(item):
    """Turn one entry of the listing's ``results`` into a CSV row.

    Issues one extra request per video to fetch its description.
    """
    raw_date = item.get('dateUTC')
    date = datetime.strptime(raw_date, "%Y-%m-%d %H:%M:%S") if raw_date else ""
    youtube_id = item.get('youtubeID')
    track = item.get('track') or {}
    detail = _fetch_json("{}/video/{}".format(_API_ROOT, item.get('slug')))
    return [
        item.get('title'),
        date,
        int(item.get('views') or 0),
        youtube_id,
        handle_slide(item.get('slides')),
        track.get('title'),
        handle_speakers(item.get('speakers') or []),
        handle_tags(item.get('tags') or []),
        handle_video(youtube_id),
        _extract_description(detail.get('description')),
    ]


def main(summit_id='vancouver-2018', output_path='data.csv'):
    """Crawl every video of one OpenStack summit and dump them to a CSV file.

    Pages through the video listing API, fetches each video's detail record
    for its description, and writes one pipe-delimited row per video
    (preceded by a header row) to *output_path*.

    Args:
        summit_id: Value sent as the ``id`` query parameter of the listing
            request.
        output_path: Destination of the pipe-delimited CSV file.

    Raises:
        requests.HTTPError: If any API request returns an error status.
    """
    rows = [list(_CSV_HEADER)]
    start = 0
    while True:
        data = _fetch_json(
            _API_ROOT + '/videos',
            params={'group': 'summit', 'id': summit_id, 'start': start},
        )

        summit = data.get('summit')
        if summit:
            for key in ('id', 'title', 'dates', 'slug'):
                print(summit.get(key))

        results = data.get('results')
        if results is not None:
            print("number of articles : {}".format(len(results)))
            rows.extend(_build_row(item) for item in results)

        # Stop unless the API explicitly reports another page. The original
        # only broke on a falsy `has_more` and would re-request the same page
        # forever when the key was absent.
        if not data.get('has_more'):
            break
        start += _PAGE_SIZE

    # newline='' is what the csv module documents for files handed to a writer.
    with open(output_path, 'w', newline='', encoding='utf8') as f:
        csv.writer(f, delimiter='|').writerows(rows)
if __name__ == "__main__":
    # Run the crawl only when executed as a script, not when imported.
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment