RamonYeung/ijcai_17_pp_title_with_keyword

## ijcai_17_pp_title_with_keyword
import requests
import re
from urllib.request import urlopen
import multiprocessing as mp
import os

def _fetch_pdf(source_url, file_name):
    """Download *source_url* and store the raw response bytes in *file_name*."""
    with urlopen(source_url) as response, open(file_name, 'wb') as f:
        f.write(response.read())  # bytes obj


def process(pair):
    """Download one paper if its title contains any of the keywords.

    Args:
        pair: ``(idx, title)`` tuple of strings -- the paper number and the
            paper title, as produced by ``re.findall`` on the index page.

    Returns:
        None. As a side effect, a matching paper is printed and saved to
        ``../data/<idx>.pdf`` with ``idx`` left-padded with zeros to four
        characters.

    Raises:
        Whatever ``urlopen`` or ``open`` raise (network or filesystem
        errors); nothing is caught here.
    """
    idx, title = pair
    idx = idx.rjust(4, '0')

    # Lower-case once; every keyword is matched as a plain substring.
    lowered = title.lower()
    if not any(word in lowered for word in keywords):
        return

    print(idx, title)
    file_name = '../data/%s.pdf' % idx
    source_url = pdf % idx
    _fetch_pdf(source_url, file_name)

    # Cheap integrity check: an empty file means the download failed.
    # Inspect the size on disk instead of reopening the file -- opening it
    # with 'wb' would truncate the data that was just written.
    if not os.path.getsize(file_name):
        print('broken file', file_name, 'try to download it again ...')
        # repeat once
        _fetch_pdf(source_url, file_name)

# Index page that lists every paper of the IJCAI 2017 proceedings.
url = 'https://www.ijcai.org/proceedings/2017/'
# Per-paper PDF address; %s is replaced by the zero-padded paper number.
pdf = 'https://www.ijcai.org/proceedings/2017/%s.pdf'
# Lower-case substrings; a paper is downloaded when its title contains any.
keywords = ['knowledge', 'reason', 'question', 'answer', 'relation', 'entity', 'ontolo', 'embed']
# Captures (paper number, title) from each paper entry of the index page.
regex = re.compile(r'<div id="paper([0-9]+)" class="paper_wrapper"><div class="title">(.*?)</div>')

# The constants above stay at module level because the worker processes read
# them from process(). Everything with side effects sits behind the guard so
# that start methods which re-import this module in each worker (spawn) do
# not fetch the index again or create nested pools.
if __name__ == '__main__':
    r = requests.get(url)
    # Fail loudly on an HTTP error instead of parsing an error page into an
    # empty paper list.
    r.raise_for_status()
    content = regex.findall(r.text)

    if not os.path.exists('../data'):
        os.mkdir('../data')
        print('Creating directory "data/" for papers ...')

    # Downloads are I/O bound, so ten workers fetch papers concurrently.
    with mp.Pool(10) as p:
        p.map(process, content)
	import requests
	import re
	from urllib.request import urlopen
	import multiprocessing as mp
	import os

	def _fetch_pdf(source_url, file_name):
	    """Download *source_url* and store the raw response bytes in *file_name*."""
	    with urlopen(source_url) as response, open(file_name, 'wb') as f:
	        f.write(response.read())  # bytes obj


	def process(pair):
	    """Download one paper if its title contains any of the keywords.

	    Args:
	        pair: ``(idx, title)`` tuple of strings -- the paper number and the
	            paper title, as produced by ``re.findall`` on the index page.

	    Returns:
	        None. As a side effect, a matching paper is printed and saved to
	        ``../data/<idx>.pdf`` with ``idx`` left-padded with zeros to four
	        characters.

	    Raises:
	        Whatever ``urlopen`` or ``open`` raise (network or filesystem
	        errors); nothing is caught here.
	    """
	    idx, title = pair
	    idx = idx.rjust(4, '0')

	    # Lower-case once; every keyword is matched as a plain substring.
	    lowered = title.lower()
	    if not any(word in lowered for word in keywords):
	        return

	    print(idx, title)
	    file_name = '../data/%s.pdf' % idx
	    source_url = pdf % idx
	    _fetch_pdf(source_url, file_name)

	    # Cheap integrity check: an empty file means the download failed.
	    # Inspect the size on disk instead of reopening the file -- opening it
	    # with 'wb' would truncate the data that was just written.
	    if not os.path.getsize(file_name):
	        print('broken file', file_name, 'try to download it again ...')
	        # repeat once
	        _fetch_pdf(source_url, file_name)

	# Index page that lists every paper of the IJCAI 2017 proceedings.
	url = 'https://www.ijcai.org/proceedings/2017/'
	# Per-paper PDF address; %s is replaced by the zero-padded paper number.
	pdf = 'https://www.ijcai.org/proceedings/2017/%s.pdf'
	# Lower-case substrings; a paper is downloaded when its title contains any.
	keywords = ['knowledge', 'reason', 'question', 'answer', 'relation', 'entity', 'ontolo', 'embed']
	# Captures (paper number, title) from each paper entry of the index page.
	regex = re.compile(r'<div id="paper([0-9]+)" class="paper_wrapper"><div class="title">(.*?)</div>')

	# The constants above stay at module level because the worker processes read
	# them from process(). Everything with side effects sits behind the guard so
	# that start methods which re-import this module in each worker (spawn) do
	# not fetch the index again or create nested pools.
	if __name__ == '__main__':
	    r = requests.get(url)
	    # Fail loudly on an HTTP error instead of parsing an error page into an
	    # empty paper list.
	    r.raise_for_status()
	    content = regex.findall(r.text)

	    if not os.path.exists('../data'):
	        os.mkdir('../data')
	        print('Creating directory "data/" for papers ...')

	    # Downloads are I/O bound, so ten workers fetch papers concurrently.
	    with mp.Pool(10) as p:
	        p.map(process, content)