marcossilva/icml_scrapper.py

## icml_scrapper.py
# Fetch the ICML 2021 poster schedule page with the requests library and
# parse the returned HTML with BeautifulSoup.
import requests
from bs4 import BeautifulSoup

# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were
# the schedule.
r = requests.get("https://icml.cc/Conferences/2021/Schedule?type=Poster", timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.text, 'html.parser')

# select_one returns the first element with class 'col-xs-12' (or None when
# the page has no such element); select then collects every div below it that
# carries an onClick attribute.
container = soup.select_one(".col-xs-12")
if container is None:
    raise RuntimeError("Could not find an element with class 'col-xs-12' in the schedule page")
divs = container.select('div[onClick]')

# This is a helper function to process each div, extracting the desired info
def process_div(d):
    """Turn one schedule card element into a title/authors/url record.

    ``d`` must expose ``select_one`` (CSS lookup) and an ``attrs`` mapping.
    The result is a pandas Series indexed by 'title', 'authors' and 'url'.
    """
    base_url = "https://icml.cc/Conferences/2021/Schedule?showEvent="

    # Text of the nested divs that hold the title and the author list.
    card_title = d.select_one('div.maincardBody').text
    card_authors = d.select_one('div.maincardFooter').text

    # The onclick value minus its first 11 characters and its last character
    # is what gets appended to the event URL.
    on_click = d.attrs.get('onclick')
    event_ref = on_click[11:-1]

    return pd.Series(
        [card_title, card_authors, base_url + event_ref],
        index=['title', 'authors', 'url'],
    )

# pandas is used as a simple way to gather the records into a DataFrame and
# save it locally as a CSV file.
import urllib.parse

import pandas as pd

# One record per matching div; the first two matches are skipped
# (presumably they are not paper cards -- TODO confirm against the page).
dfs = [process_div(d) for d in divs[2:]]

# Building the frame directly from the records with explicit column names
# keeps it well-formed even when nothing was scraped, where pd.concat would
# fail with "No objects to concatenate".
df = pd.DataFrame(dfs, columns=['title', 'authors', 'url'])

# The papers' PDFs are not available on the ICML website, so also build a
# ready-to-use Google search URL for each title to make the paper easier to find.
df['search_url'] = df['title'].apply(
    lambda t: "https://www.google.com/search?q=" + urllib.parse.quote_plus(t)
)
df.to_csv('icml_2021.csv')
	# Fetch the ICML 2021 poster schedule page with the requests library and
	# parse the returned HTML with BeautifulSoup.
	import requests
	from bs4 import BeautifulSoup

	# A timeout keeps the script from hanging forever on a stalled connection,
	# and raise_for_status() stops us from parsing an HTTP error page as if it
	# were the schedule.
	r = requests.get("https://icml.cc/Conferences/2021/Schedule?type=Poster", timeout=30)
	r.raise_for_status()

	soup = BeautifulSoup(r.text, 'html.parser')

	# select_one returns the first element with class 'col-xs-12' (or None when
	# the page has no such element); select then collects every div below it
	# that carries an onClick attribute.
	container = soup.select_one(".col-xs-12")
	if container is None:
		raise RuntimeError("Could not find an element with class 'col-xs-12' in the schedule page")
	divs = container.select('div[onClick]')

	# This is a helper function to process each div, extracting the desired info
	def process_div(d):
		"""Extract the title, authors and event URL from one schedule card.

		``d`` must expose ``select_one`` (CSS lookup) and an ``attrs`` mapping.
		Returns a pandas Series with the keys 'title', 'authors' and 'url'.
		"""
		# The title and authors are the text of their respective nested divs.
		title = d.select_one('div.maincardBody').text
		authors = d.select_one('div.maincardFooter').text
		# The URL is built from the onclick value with its first 11 characters
		# and its last character removed.
		url = "https://icml.cc/Conferences/2021/Schedule?showEvent=" + d.attrs.get('onclick')[11:-1]

		return pd.Series({'title' : title, 'authors' : authors, 'url' : url})

	# pandas is used as a simple way to gather the records into a DataFrame and
	# save it locally as a CSV file.
	import urllib.parse

	import pandas as pd

	# One record per matching div; the first two matches are skipped
	# (presumably they are not paper cards -- TODO confirm against the page).
	dfs = [process_div(d) for d in divs[2:]]

	# Building the frame directly from the records with explicit column names
	# keeps it well-formed even when nothing was scraped, where pd.concat would
	# fail with "No objects to concatenate".
	df = pd.DataFrame(dfs, columns=['title', 'authors', 'url'])

	# The papers' PDFs are not available on the ICML website, so also build a
	# ready-to-use Google search URL for each title to make the paper easier to find.
	df['search_url'] = df['title'].apply(
		lambda t: "https://www.google.com/search?q=" + urllib.parse.quote_plus(t)
	)
	df.to_csv('icml_2021.csv')