Created
March 16, 2018 04:51
-
-
Save manodeep/e06ba7ecbb11c3bd2d4c749060e49c55 to your computer and use it in GitHub Desktop.
Example code to scrape the CAS colloquia page
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def check_quantity_has_only_one_element(list_of_quant):
    """Reduce every quantity to a single value, rejecting multi-element input.

    Parameters
    ----------
    list_of_quant : iterable of (value, name) tuples
        ``value`` is either a scalar (anything without a ``len``), a string,
        or a container holding exactly one element. ``name`` is the
        human-readable label used in the error message.

    Returns
    -------
    list
        One entry per input tuple, in the same order. Scalars and strings
        are returned unchanged; one-element containers are replaced by
        their only element.

    Raises
    ------
    ValueError
        If a non-string container does not hold exactly one element.
    """
    values = []
    for val, name in list_of_quant:
        # Strings have a length but represent a single value here; without
        # this guard "Colloquium" would be reduced to "C".
        if isinstance(val, (str, bytes)):
            values.append(val)
            continue

        try:
            nelements = len(val)
        except TypeError:
            # scalar quantity has no len
            values.append(val)
            continue

        if nelements != 1:
            msg = "{0} must be a scalar quantity or an 1-element array.\n"\
                  "Found = {1} instead".format(name, val)
            raise ValueError(msg)

        values.append(val[0])

    return values
def download_web_talks_for_year(year):
    """Fetch the CAS colloquia page for ``year`` and return its talk entries.

    Parameters
    ----------
    year : int or one-element sequence of int
        Calendar year to fetch. Must lie within [2000, current year].

    Returns
    -------
    list
        The ``<div class="talk">`` elements found on the page, as returned
        by BeautifulSoup's ``findAll``. Empty when the page lists no talks.

    Raises
    ------
    ValueError
        If ``year`` is outside the supported range.
    RuntimeError
        If the server answers with a status code other than 200.
    """
    import datetime

    import requests
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        from BeautifulSoup import BeautifulSoup

    # Need to pass a list of tuples; the first element is the
    # value itself, while the second is the human-understandable
    # name of the variable.
    # The helper returns a list with one entry per tuple, so take the
    # single entry; comparing the list itself against an int would fail.
    year = check_quantity_has_only_one_element([(year, "year")])[0]

    current_year = datetime.datetime.now().year
    min_year = 2000  # CAS only has records going back to 2000
    if year > current_year or year < min_year:
        msg = "Year = {0} should be within [{1}, {2}]".format(year, min_year, current_year)
        raise ValueError(msg)

    # Years up to and including 2010 are served from static per-year pages;
    # later years come from a PHP page taking the year as a query parameter.
    if year <= 2010:
        url = "http://astronomy.swin.edu.au/research/colloquia_{0}.html".format(year)
    else:
        url = "http://astronomy.swin.edu.au/research/colloquia.php?year={0}".format(year)

    # Without a timeout an unresponsive server would block forever.
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        msg = "Encountered error while fetching webpage {0}".format(url)
        raise RuntimeError(msg)

    soup = BeautifulSoup(r.content, "lxml")

    # search for individual talk entries
    talks = soup.findAll(["div"], {"class": ["talk"]})
    return talks
def parse_talks_for_year(year,
                         requested_talk_type="Colloquium",
                         text_type="abstract",
                         save_image=False, **wc_kwargs):
    """Build a word cloud from the talk texts published for one year.

    Parameters
    ----------
    year : int or one-element sequence of int
        Year whose talks are downloaded via ``download_web_talks_for_year``.
    requested_talk_type : str
        Kind of talk to keep; only "Colloquium" is accepted. The filter is
        applied only for years >= 2011.
    text_type : str
        Section of each talk to collect: "abstract" or "title".
    save_image : bool
        When true, the cloud is also written to
        ``<requested_talk_type>-<text_type>-<year>-cat.png``.
    **wc_kwargs
        Passed unchanged to the ``WordCloud`` constructor.

    Returns
    -------
    tuple or None
        ``(wordcloud, all_text_content)`` on success. ``None`` when the page
        has no talks or when no usable text was collected.

    Raises
    ------
    ValueError
        If ``requested_talk_type`` or ``text_type`` is not a supported value.
    """
    from wordcloud import WordCloud

    check_quantity_has_only_one_element([(requested_talk_type, "requested talk type"),
                                         (text_type, "text section to parse"),
                                         ])
    # The download step accepts a one-element sequence for the year; reduce
    # it to a scalar here too so the comparison and file name below work.
    year = check_quantity_has_only_one_element([(year, "year")])[0]

    valid_talk_types = ["Colloquium"]
    if requested_talk_type not in valid_talk_types:
        msg = "Please request talks of one of these types = {0}"\
              .format(valid_talk_types)
        raise ValueError(msg)

    valid_text_types = ["abstract", "title"]
    if text_type not in valid_text_types:
        msg = "Please request the text from these following sections = {0}"\
              .format(valid_text_types)
        raise ValueError(msg)

    talks = download_web_talks_for_year(year)
    ntalks = len(talks)
    if ntalks == 0:
        return None

    wanted_type = requested_talk_type.upper()
    collected = []
    talks_of_correct_type = 0
    for talk in talks:
        if year >= 2011:
            talk_type = talk.find(["span"], {"class": ["title"]})
            # A talk entry without a title span cannot be matched against
            # the requested type, so skip it instead of failing on `.text`.
            if talk_type is None or wanted_type not in talk_type.text.upper():
                continue

        talks_of_correct_type += 1
        text_content = talk.find(["div", "span"], {"class": [text_type]})

        # there was no abstract field for this talk
        if text_content is None:
            continue

        # extract the text for the abstract
        text_content = text_content.text

        # short text in the content (abstract, talk title) are
        # probably a sign that no relevant text was supplied/found
        if len(text_content) <= 50:
            continue

        if "TBC" in text_content or "TBA" in text_content:
            print("Found TBC or TBA in the {0} text field".format(text_type))
            continue

        collected.append(text_content)

    print("Found {0} {1} talks (out of a total of {2} talks) for year {3}".
          format(talks_of_correct_type, requested_talk_type, ntalks, year))

    # Every kept text is preceded by a newline, so the combined text starts
    # with one; an empty collection yields the empty string.
    all_text_content = "\n".join([""] + collected)
    if len(all_text_content) == 0:
        return None

    # replace galaxy with galaxies
    all_text_content = all_text_content.replace("galaxy", "galaxies")
    all_text_content = all_text_content.replace("Galaxy", "galaxies")

    wordcloud = WordCloud(**wc_kwargs).generate(all_text_content)
    if save_image:
        import matplotlib.pyplot as plt
        fig = plt.figure(figsize=(12, 12))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        fig.savefig("{0}-{1}-{2}-cat.png".format(requested_talk_type, text_type, year))
        # Close the figure created above explicitly rather than whichever
        # figure happens to be current.
        plt.close(fig)

    return wordcloud, all_text_content
# Years to process: 2000 through 2018 inclusive (19 entries).
years = list(range(2000, 2019))
from wordcloud import STOPWORDS | |
# Start from wordcloud's built-in stop words and add extra words that
# should be excluded from the generated clouds.
stopwords = set(STOPWORDS) | {
    "will",
    "results",
    "observations",
    "abstract",
    "discuss",
    "provide",
    "use",
    "show",
    "based",
    "us",
    "using",
    "used",
    "now",
    "within",
    "Colloquium",
    "still",
}

# NOTE(review): presumably the word-cloud image dimensions in pixels; they
# are not used in the visible part of the script - confirm against the caller.
width, height = 1600, 1200
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment