@masakuni-ito
Created December 20, 2019 16:03
A crawler that collects events held in Shizuoka.
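
The script below scrapes the connpass search results for events held in Shizuoka during 2019 and appends one row per event (title, URL, group, description, start/end time, address) to events.csv. Its only third-party dependency is pyquery, and it sleeps 6 seconds between requests to stay polite to the server.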
import sys
import csv
import re
import math
from time import sleep

from pyquery import PyQuery as pq


def parse(link: str):
    """Fetch one connpass event page and return its fields as a CSV row."""
    row = []
    title = ''
    group_title = ''
    group_title_link = ''
    description = ''
    dtstart = ''
    dtend = ''
    address = ''

    dom = pq(link)

    # event_title: strip the " - connpass" suffix from the page title
    if dom('title').text():
        title = re.sub(r' - connpass$', '', dom('title').text().strip())

    # group_title (and its href)
    if dom('.group_title').text():
        group_title = dom('.group_title').text().strip()
    if dom('.group_title > a').attr('href'):
        group_title_link = dom('.group_title > a').attr('href').strip()

    # description: taken from the <meta name="description"> tag
    for n in dom('meta').items():
        if n.attr('name') == 'description':
            description = n.attr('content').strip()
            break

    # event_schedule_area: start/end timestamps from the microformat markup
    if dom('.dtstart > .value-title').attr('title'):
        dtstart = dom('.dtstart > .value-title').attr('title').strip()
    if dom('.dtend > .value-title').attr('title'):
        dtend = dom('.dtend > .value-title').attr('title').strip()

    # event_place_area
    if dom('.event_place_area .adr').text():
        address = dom('.event_place_area .adr').text().strip()

    # add row
    row.append(title)
    row.append(link)
    row.append(group_title)
    row.append(group_title_link)
    row.append(description)
    row.append(dtstart)
    row.append(dtend)
    row.append(address)

    return row

def main():
    links = []
    max_page_num = 0

    # Work out how many result pages there are: .main_h2 shows the total
    # hit count, and connpass search returns 10 events per page.
    try:
        url = "https://connpass.com/search/?page=1&q=&start_from=2019%2F01%2F01&start_to=2019%2F12%2F31&prefectures=shizuoka&selectItem=shizuoka"
        max_item_num = int(re.sub(r'\D', '', pq(url)('.main_h2').text().strip()))
        max_page_num = math.ceil(max_item_num / 10)
    except Exception as e:
        print("ERROR: " + str(e))
        sys.exit(1)
    else:
        sleep(6)  # a grateful 6-second sleep, to go easy on the server

    # Collect event page URLs from every search result page (pages are 1-indexed).
    for page_num in range(1, max_page_num + 1):
        try:
            url = "https://connpass.com/search/?page={}&q=&start_from=2019%2F01%2F01&start_to=2019%2F12%2F31&prefectures=shizuoka&selectItem=shizuoka".format(page_num)
            for node in pq(url)('.event_detail_area').items():
                href = node.find('.event_title > a').attr('href')
                if not href:
                    raise Exception('a tag has no href.')
                links.append(href)
        except Exception as e:
            print("ERROR: " + str(e))
            sys.exit(1)
        else:
            sleep(6)  # a grateful 6-second sleep, to go easy on the server

    # Parse each event page and append it to events.csv as we go.
    for link in links:
        rows = []
        try:
            rows.append(parse(link))
            with open('events.csv', 'a') as f:
                writer = csv.writer(f, lineterminator='\n')
                writer.writerows(rows)
        except Exception as e:
            print("ERROR: " + str(e) + ": " + link)
            sys.exit(1)
        else:
            sleep(6)  # a grateful 6-second sleep, to go easy on the server


if __name__ == "__main__":
    main()
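
For a quick sanity check of the output, the CSV can be read back with the standard library alone. A minimal sketch, assuming events.csv was produced by this script (the column order matches the order parse() appends fields):

import csv

# Column order as built in parse().
FIELDS = ['title', 'link', 'group_title', 'group_title_link',
          'description', 'dtstart', 'dtend', 'address']

with open('events.csv', newline='') as f:
    for row in csv.reader(f):
        event = dict(zip(FIELDS, row))
        print(event['dtstart'], event['title'])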