Created
October 5, 2018 09:51
-
-
Save yc0/bac8928e22e51a37c2594dbffe458046 to your computer and use it in GitHub Desktop.
Crawl information about OpenStack Summit videos
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime | |
from lxml import etree | |
import requests as req | |
import csv | |
import re | |
def handle_slide(slide):
    """Return the absolute openstack.org URL for a slide-deck path.

    Args:
        slide: Site-relative path of the slide deck, or a falsy value
            (``None`` / ``""``) when the video has no slides.

    Returns:
        ``'https://www.openstack.org' + slide`` when *slide* starts with
        ``/asset``; otherwise the empty string.
    """
    # A plain prefix test is all the original regex (r'^/asset') expressed.
    if slide and slide.startswith('/asset'):
        return 'https://www.openstack.org' + slide
    return ""
def handle_video(youtube_id):
    """Return the YouTube embed URL for *youtube_id*, or "" when it is falsy."""
    if not youtube_id:
        return ""
    return "https://www.youtube.com/embed/" + youtube_id
def handle_speakers(speakers):
    """Join speaker names into one comma-separated string.

    Args:
        speakers: Iterable of dicts that carry a ``'name'`` key, or ``None``.

    Returns:
        The names joined with ``","`` in input order. Entries without a
        name are skipped; ``None`` or an empty iterable yields ``""``.
    """
    # `or ()` tolerates a missing speakers list; dropping None names keeps
    # str.join from raising TypeError on incomplete entries.
    names = (speaker.get('name') for speaker in speakers or ())
    return ",".join(name for name in names if name is not None)
def handle_tags(tags):
    """Join tag labels into one comma-separated string.

    Args:
        tags: Iterable of dicts that carry a ``'tag'`` key, or ``None``.

    Returns:
        The labels joined with ``","`` in input order. Entries without a
        label are skipped; ``None`` or an empty iterable yields ``""``.
    """
    # `or ()` tolerates a missing tag list; dropping None labels keeps
    # str.join from raising TypeError on incomplete entries.
    labels = (entry.get('tag') for entry in tags or ())
    return ",".join(label for label in labels if label is not None)
def _fetch_json(url):
    """GET *url* and return its decoded JSON body.

    Raises:
        requests.RequestException: on timeout, connection failure or an
            HTTP error status.
    """
    # Without a timeout a stalled connection would hang the crawl forever.
    resp = req.get(url, timeout=30)
    resp.raise_for_status()
    return resp.json()


def _extract_description(html):
    """Flatten a video's HTML description into a single line of text.

    Collects the text nodes of every ``<p>`` and of every ``<span>`` nested
    in a ``<p>``. A single text node is returned unchanged; otherwise the
    non-blank nodes are stripped, numbered ``[1]``, ``[2]``, ... and
    concatenated. A missing or empty description yields ``""``.
    """
    if not html:
        return ""
    tree = etree.HTML(html)
    if tree is None:
        return ""
    fragments = tree.xpath('//p/text()') + tree.xpath('//p//span/text()')
    if len(fragments) == 1:
        return fragments[0]
    # NOTE(review): the original also called p.replace('|', '|') (result
    # discarded, so a no-op) and p.strip(" ") twice after p.strip(). The
    # source carried a hidden-Unicode warning, so those calls may have
    # targeted invisible characters that are no longer recoverable; confirm
    # against the original file. The csv writer already quotes any field
    # containing the '|' delimiter, so no manual escaping is required.
    texts = [text for text in (fragment.strip() for fragment in fragments) if text]
    return "".join(
        "[{}]{}".format(number, text) for number, text in enumerate(texts, start=1)
    )


def _video_row(item):
    """Build one CSV row for a video entry returned by the listing API.

    Issues one extra request per video to fetch its description.
    """
    raw_date = item.get('dateUTC')
    date = datetime.strptime(raw_date, "%Y-%m-%d %H:%M:%S") if raw_date else None
    youtube_id = item.get('youtubeID')
    detail = _fetch_json(
        "{}{}".format('https://www.openstack.org/videos/api/video/', item.get('slug'))
    )
    return [
        item.get('title'),
        date,
        int(item.get('views') or 0),
        youtube_id,
        handle_slide(item.get('slides')),
        (item.get('track') or {}).get('title'),
        handle_speakers(item.get('speakers') or []),
        handle_tags(item.get('tags') or []),
        handle_video(youtube_id),
        _extract_description(detail.get('description')),
    ]


def main():
    """Crawl the Vancouver 2018 summit video listing and write ``data.csv``.

    Pages through the listing API 50 entries at a time, prints the summit
    metadata and the per-page result count, fetches each video's
    description, and finally writes all rows (header first) to ``data.csv``
    as '|'-delimited CSV.
    """
    page_size = 50
    listing_url = (
        'https://www.openstack.org/videos/api/videos'
        '?group=summit&id=vancouver-2018&start='
    )
    rows = [[
        'Title', 'Date', '#Views', 'YoutubeID', 'Slide',
        'Tracks', 'Speakers', 'Tags', 'Video', 'Description',
    ]]

    start = 0
    while True:
        data = _fetch_json(listing_url + str(start))

        summit = data.get('summit')
        if summit:
            for key in ('id', 'title', 'dates', 'slug'):
                print(summit.get(key))

        if 'results' in data:
            results = data.get('results') or []
            print("number of articles : {}".format(len(results)))
            rows.extend(_video_row(item) for item in results)

        # Stop when the API reports no further pages. A missing 'has_more'
        # key also stops the crawl; previously it re-fetched the same page
        # forever.
        if not data.get('has_more'):
            break
        start += page_size

    # newline='' is what the csv module asks for; the writer emits its own
    # line terminators.
    with open('data.csv', 'w', newline='', encoding='utf8') as f:
        csv.writer(f, delimiter='|').writerows(rows)
# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment