Skip to content

Instantly share code, notes, and snippets.

@RamonYeung
Created April 15, 2018 14:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save RamonYeung/d44336016e69b227669fe2fd6293d9f9 to your computer and use it in GitHub Desktop.
Save RamonYeung/d44336016e69b227669fe2fd6293d9f9 to your computer and use it in GitHub Desktop.
Automatically download IJCAI 2017 accepted papers whose titles match customized keywords
import requests
import re
from urllib.request import urlopen
import multiprocessing as mp
import os
def process(pair):
    """Download one paper's PDF if its title contains any of the global keywords.

    pair: a ``(idx, title)`` tuple scraped from the proceedings index page;
    ``idx`` is the paper number as a string, ``title`` the paper title text.
    Relies on the module-level globals ``keywords`` (match terms) and
    ``pdf`` (URL template). Saves the file as ``../data/<idx>.pdf``.
    """
    idx, title = pair
    # Paper numbers are zero-padded to 4 digits in the IJCAI PDF URLs.
    idx = idx.zfill(4)
    for word in keywords:
        if word in title.lower():
            print(idx, title)
            file_name = '../data/%s.pdf' % idx
            with urlopen(pdf % idx) as response, open(file_name, 'wb') as f:
                f.write(response.read())  # bytes obj
            # pdf integrity checking, can be done manually.
            # BUG FIX: the original opened with 'wb' here, which truncated
            # the file just written and made f.read() raise on a write-only
            # handle; read-only 'rb' is required for the check.
            with open(file_name, 'rb') as f:
                broken = not f.read()
            if broken:
                print('broken file', file_name, 'try to download it again ...')
                # repeat the download once
                # BUG FIX: the retry wrote to the closed handle `f`
                # instead of the freshly opened `g`.
                with urlopen(pdf % idx) as response, open(file_name, 'wb') as g:
                    g.write(response.read())
            # One matching keyword is enough; stop scanning the rest.
            break
# Proceedings index page and per-paper PDF URL template.
url = 'https://www.ijcai.org/proceedings/2017/'
pdf = 'https://www.ijcai.org/proceedings/2017/%s.pdf'
# Lower-case title substrings that select which papers to download.
keywords = ['knowledge', 'reason', 'question', 'answer', 'relation', 'entity', 'ontolo', 'embed']
# Captures (paper number, title) pairs from the proceedings index HTML.
regex = re.compile(r'<div id="paper([0-9]+)" class="paper_wrapper"><div class="title">(.*?)</div>')

# BUG FIX: the __main__ guard is required when using multiprocessing —
# on spawn-start platforms (Windows, macOS default) each worker re-imports
# this module, and without the guard it would re-create the pool and
# re-fetch the index recursively. Constants above stay at module level so
# re-imported workers still see `keywords` and `pdf`.
if __name__ == '__main__':
    r = requests.get(url)
    content = re.findall(regex, r.text)
    if not os.path.exists('../data'):
        print('Creating directory "data/" for papers ...')
        os.mkdir('../data')
    with mp.Pool(10) as p:
        p.map(process, content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment