Skip to content

Instantly share code, notes, and snippets.

@manodeep
Created March 16, 2018 04:51
Show Gist options
  • Save manodeep/e06ba7ecbb11c3bd2d4c749060e49c55 to your computer and use it in GitHub Desktop.
Save manodeep/e06ba7ecbb11c3bd2d4c749060e49c55 to your computer and use it in GitHub Desktop.
Example code to scrape the CAS colloquia pages (astronomy.swin.edu.au) and build word clouds from the talk titles or abstracts
def check_quantity_has_only_one_element(list_of_quant):
    """Reduce every quantity to a single scalar value.

    Parameters
    ----------
    list_of_quant : iterable of (value, name) tuples
        ``value`` is either a scalar or a sequence holding exactly one
        element; ``name`` is the human-understandable label that is used
        in the error message.

    Returns
    -------
    list
        One scalar per input pair, in the order the pairs were supplied.

    Raises
    ------
    ValueError
        If a value is a sequence with zero or with more than one element.
    """
    values = []
    for val, name in list_of_quant:
        # Strings have a length but are scalar quantities for our purposes
        # (e.g. the talk type "Colloquium"); never index into them.
        if isinstance(val, (str, bytes)):
            values.append(val)
            continue

        try:
            nelements = len(val)
        except TypeError:
            # scalar quantity has no len
            values.append(val)
            continue

        if nelements != 1:
            msg = "{0} must be a scalar quantity or an 1-element array.\n"\
                  "Found = {1} instead".format(name, val)
            raise ValueError(msg)

        values.append(val[0])
    return values
def download_web_talks_for_year(year):
    """Download the CAS colloquia page for ``year`` and return its talks.

    Parameters
    ----------
    year : int or 1-element array of int
        Calendar year to fetch. Must lie within [2000, current year].

    Returns
    -------
    talks
        The result of searching the parsed page for ``<div class="talk">``
        elements; empty when the page contains no such entries.

    Raises
    ------
    ValueError
        If ``year`` is outside the supported range.
    RuntimeError
        If the server answers with anything other than HTTP 200.
    requests.exceptions.RequestException
        If the request itself fails (connection error, timeout, ...).
    """
    import datetime

    import requests
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        from BeautifulSoup import BeautifulSoup

    # Need to pass a list of tuples; the first element is the
    # value itself, while the second is the human-understandable
    # name of the variable. The helper hands back a list, so pick
    # out its only entry to get the scalar year.
    year = check_quantity_has_only_one_element([(year, "year")])[0]

    current_year = datetime.datetime.now().year
    min_year = 2000  # CAS only has records going back to 2000
    if not min_year <= year <= current_year:
        msg = "Year = {0} should be within [{1}, {2}]".format(year, min_year, current_year)
        raise ValueError(msg)

    # Pages up to and including 2010 are static html files; later years
    # are served through a php query.
    if year <= 2010:
        url = "http://astronomy.swin.edu.au/research/colloquia_{0}.html".format(year)
    else:
        url = "http://astronomy.swin.edu.au/research/colloquia.php?year={0}".format(year)

    # Without a timeout requests waits forever on an unresponsive server
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        msg = "Encountered error while fetching webpage {0}".format(url)
        raise RuntimeError(msg)

    soup = BeautifulSoup(r.content, "lxml")

    # search for individual talk entries
    talks = soup.findAll(["div"], {"class": ["talk"]})
    return talks
def parse_talks_for_year(year,
                         requested_talk_type="Colloquium",
                         text_type="abstract",
                         save_image=False, **wc_kwargs):
    """Build a word cloud from the talks given in ``year``.

    Parameters
    ----------
    year : int or 1-element array of int
        Year whose talks should be downloaded and parsed.
    requested_talk_type : str
        Kind of talk to keep. Only "Colloquium" is accepted. The filter
        is applied for years >= 2011 only; entries of earlier years are
        all kept.
    text_type : str
        Section of each talk entry to read: "abstract" or "title".
    save_image : bool
        When true, the word cloud is also written to
        "<talk type>-<text type>-<year>-cat.png" in the working directory.
    **wc_kwargs
        Forwarded unchanged to the ``WordCloud`` constructor.

    Returns
    -------
    (wordcloud, all_text_content) or None
        The generated word cloud and the text it was generated from.
        ``None`` is returned when the page has no talks or when no usable
        text was found, so callers must check before unpacking.

    Raises
    ------
    ValueError
        If ``requested_talk_type`` or ``text_type`` is not supported.
    """
    from wordcloud import WordCloud

    check_quantity_has_only_one_element([(requested_talk_type, "requested talk type"),
                                         (text_type, "text section to parse"),
                                         ])
    # The downloader accepts a 1-element array for the year; reduce it to
    # a scalar here as well because the year is compared and formatted below.
    year = check_quantity_has_only_one_element([(year, "year")])[0]

    valid_talk_types = ["Colloquium"]
    if requested_talk_type not in valid_talk_types:
        msg = "Please request talks of one of these types = {0}"\
              .format(valid_talk_types)
        raise ValueError(msg)

    valid_text_types = ["abstract", "title"]
    if text_type not in valid_text_types:
        msg = "Please request the text from these following sections = {0}"\
              .format(valid_text_types)
        raise ValueError(msg)

    talks = download_web_talks_for_year(year)
    ntalks = len(talks)
    if ntalks == 0:
        return None

    wanted_type = requested_talk_type.upper()  # loop invariant
    text_chunks = []
    talks_of_correct_type = 0
    for talk in talks:
        if year >= 2011:
            talk_type = talk.find(["span"], {"class": ["title"]})
            # an entry without a title span can not be classified, and
            # would otherwise crash on the `.text` access
            if talk_type is None or wanted_type not in talk_type.text.upper():
                continue

        talks_of_correct_type += 1
        text_content = talk.find(["div", "span"], {"class": [text_type]})

        # there was no abstract field for this talk
        if text_content is None:
            continue

        # extract the text for the abstract
        text_content = text_content.text

        # short text in the content (abstract, talk title) are
        # probably a sign that no relevant text was supplied/found
        if len(text_content) <= 50:
            continue

        if "TBC" in text_content or "TBA" in text_content:
            print("Found TBC or TBA in the {0} text field".format(text_type))
            continue

        text_chunks.append(text_content)

    print("Found {0} {1} talks (out of a total of {2} talks) for year {3}".
          format(talks_of_correct_type, requested_talk_type, ntalks, year))

    if not text_chunks:
        return None

    # Every chunk is preceded by a newline (so the text starts with one),
    # built with a single join rather than repeated concatenation.
    all_text_content = "".join("\n" + chunk for chunk in text_chunks)

    # replace galaxy with galaxies
    all_text_content = all_text_content.replace("galaxy", "galaxies")
    all_text_content = all_text_content.replace("Galaxy", "galaxies")

    wordcloud = WordCloud(**wc_kwargs).generate(all_text_content)
    if save_image:
        import matplotlib.pyplot as plt
        fig = plt.figure(figsize=(12, 12))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        fig.savefig("{0}-{1}-{2}-cat.png".format(requested_talk_type, text_type, year))
        # close the figure that was created here, not whichever is current
        plt.close(fig)

    return wordcloud, all_text_content
# Years to process: 2000 through 2018, inclusive
years = list(range(2000, 2019))

from wordcloud import STOPWORDS

# Start from the stopwords that ship with wordcloud and add further
# words that should be left out of the generated word clouds.
stopwords = set(STOPWORDS)
stopwords.update((
    "will",
    "results",
    "observations",
    "abstract",
    "discuss",
    "provide",
    "use",
    "show",
    "based",
    "us",
    "using",
    "used",
    "now",
    "within",
    "Colloquium",
    "still",
))

# NOTE(review): presumably the pixel size of the word-cloud canvas; these
# are not used anywhere in the visible part of the script -- confirm.
width, height = 1600, 1200
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment