# manodeep/scrape.py

## scrape.py
def check_quantity_has_only_one_element(list_of_quant):
    """Reduce each supplied quantity to a single scalar value.

    Parameters
    ----------
    list_of_quant : iterable of (value, name) tuples
        ``value`` is either a scalar or a sequence holding exactly one
        element; ``name`` is the human-readable label used in the error
        message.

    Returns
    -------
    list
        One entry per input tuple, in input order: the scalar itself, or
        the sole element of a one-element sequence.

    Raises
    ------
    ValueError
        If a value is a sequence that does not hold exactly one element.
    """
    values = []
    for val, name in list_of_quant:
        # Strings have a length but count as one quantity here (e.g. the
        # talk type "Colloquium"), so they must not be indexed.
        if isinstance(val, (str, bytes)):
            values.append(val)
            continue

        try:
            nelements = len(val)
        except TypeError:
            # scalar quantity has no len
            values.append(val)
            continue

        if nelements != 1:
            msg = "{0} must be a scalar quantity or an 1-element array.\n"\
                  "Found = {1} instead".format(name, val)
            raise ValueError(msg)

        values.append(val[0])

    return values


  def download_web_talks_for_year(year):
    """Download the colloquia listing for ``year`` and return its talk entries.

    Parameters
    ----------
    year : scalar or one-element sequence
        Year to fetch; must lie within [2000, current year].

    Returns
    -------
    The result of ``soup.findAll`` for every ``div`` element carrying the
    ``talk`` class on the downloaded page (empty when there are none).

    Raises
    ------
    ValueError
        If ``year`` is outside [2000, current year].
    RuntimeError
        If the HTTP response status is not 200.
    """
    import datetime
    import requests
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        from BeautifulSoup import BeautifulSoup

    # The helper takes a list of (value, name) tuples and hands back a
    # list with one entry per tuple, so pull out the single year value
    # before comparing it against integers below.
    year = check_quantity_has_only_one_element([(year, "year")])[0]

    current_year = datetime.datetime.now().year
    min_year = 2000  # CAS only has records going back to 2000
    if year > current_year or year < min_year:
        msg = "Year = {0} should be within [{1}, {2}]".format(year, min_year, current_year)
        raise ValueError(msg)

    # Listings up to 2010 are separate per-year html pages; later years
    # are served through a php query
    if year <= 2010:
        url = "http://astronomy.swin.edu.au/research/colloquia_{0}.html".format(year)
    else:
        url = "http://astronomy.swin.edu.au/research/colloquia.php?year={0}".format(year)

    r = requests.get(url)
    if r.status_code != 200:
        msg = "Encountered error while fetching webpage {0}".format(url)
        raise RuntimeError(msg)

    soup = BeautifulSoup(r.content, "lxml")

    # search for individual talk entries
    talks = soup.findAll(["div"], {"class": ["talk"]})

    return talks


  def parse_talks_for_year(year,
                           requested_talk_type="Colloquium",
                           text_type="abstract",
                           save_image=False, **wc_kwargs):
    """Build a word cloud from the talks listed for ``year``.

    Parameters
    ----------
    year : scalar or one-element sequence
        Year whose talk listing is downloaded.
    requested_talk_type : str
        Must be "Colloquium". For years from 2011 onwards a talk is kept
        only when this string occurs (case-insensitively) in the talk's
        ``title`` span; earlier years are not filtered by type.
    text_type : str
        Which section of each talk to collect: "abstract" or "title".
    save_image : bool
        When true, render the cloud with matplotlib and save it as
        "<talk type>-<text type>-<year>-cat.png" (relative path).
    **wc_kwargs
        Forwarded unchanged to the ``WordCloud`` constructor.

    Returns
    -------
    tuple or None
        ``(wordcloud, all_text_content)``, or ``None`` when the listing
        holds no talks or no usable text was collected.

    Raises
    ------
    ValueError
        If ``requested_talk_type`` or ``text_type`` is not supported.
    """
    from wordcloud import WordCloud
    check_quantity_has_only_one_element([(requested_talk_type, "requested talk type"),
                                         (text_type, "text section to parse"),
                                         ])

    valid_talk_types = ["Colloquium"]
    if requested_talk_type not in valid_talk_types:
        msg = "Please request talks of one of these types = {0}"\
               .format(valid_talk_types)
        raise ValueError(msg)

    valid_text_types = ["abstract", "title"]
    if text_type not in valid_text_types:
        msg = "Please request the text from these following sections = {0}"\
               .format(valid_text_types)
        raise ValueError(msg)

    # Accept a one-element sequence for the year as well, so that the
    # integer comparison inside the loop below works on a plain value
    year = check_quantity_has_only_one_element([(year, "year")])[0]

    talks = download_web_talks_for_year(year)
    ntalks = len(talks)
    if ntalks == 0:
        return None

    text_chunks = []
    talks_of_correct_type = 0
    for talk in talks:
        if year >= 2011:
            talk_type = talk.find(["span"], {"class": ["title"]})
            # find() gives None when the entry has no title span; such
            # an entry cannot be matched against the requested type
            if talk_type is None:
                continue
            if requested_talk_type.upper() not in talk_type.text.upper():
                continue

        talks_of_correct_type += 1

        text_content = talk.find(["div", "span"], {"class": [text_type]})

        # there was no abstract field for this talk
        if text_content is None:
            continue

        # extract the text for the abstract
        text_content = text_content.text

        # short text in the content (abstract, talk title) are
        # probably a sign that no relevant text was supplied/found
        if len(text_content) <= 50:
            continue

        if "TBC" in text_content or "TBA" in text_content:
            print("Found TBC or TBA in the {0} text field".format(text_type))
            continue

        text_chunks.append(text_content)

    print("Found {0} {1} talks (out of a total of {2} talks) for year {3}".
          format(talks_of_correct_type, requested_talk_type, ntalks, year))
    if not text_chunks:
        return None

    # every chunk is introduced by a newline so that neighbouring talks
    # never run into each other
    all_text_content = "".join("\n" + chunk for chunk in text_chunks)

    # replace galaxy with galaxies
    all_text_content = all_text_content.replace("galaxy", "galaxies")
    all_text_content = all_text_content.replace("Galaxy", "galaxies")
    wordcloud = WordCloud(**wc_kwargs).generate(all_text_content)

    if save_image:
        import matplotlib.pyplot as plt
        fig = plt.figure(figsize=(12, 12))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        fig.savefig("{0}-{1}-{2}-cat.png".format(requested_talk_type, text_type, year))
        plt.close(fig)

    return wordcloud, all_text_content

# Years covered: 2000 through 2018 inclusive
years = list(range(2000, 2019))

from wordcloud import STOPWORDS

# Stop words shipped with the wordcloud package, extended with extra
# words to be excluded as well
stopwords = set(STOPWORDS) | {
    "will",
    "results",
    "observations",
    "abstract",
    "discuss",
    "provide",
    "use",
    "show",
    "based",
    "us",
    "using",
    "used",
    "now",
    "within",
    "Colloquium",
    "still",
}

# NOTE(review): presumably the word-cloud canvas size in pixels -- the
# point of use is not visible here, confirm against the caller
width, height = 1600, 1200
	def check_quantity_has_only_one_element(list_of_quant):
	    """Reduce each supplied quantity to a single scalar value.

	    Parameters
	    ----------
	    list_of_quant : iterable of (value, name) tuples
	        ``value`` is either a scalar or a sequence holding exactly one
	        element; ``name`` is the human-readable label used in the error
	        message.

	    Returns
	    -------
	    list
	        One entry per input tuple, in input order: the scalar itself, or
	        the sole element of a one-element sequence.

	    Raises
	    ------
	    ValueError
	        If a value is a sequence that does not hold exactly one element.
	    """
	    values = []
	    for val, name in list_of_quant:
	        # Strings have a length but count as one quantity here (e.g. the
	        # talk type "Colloquium"), so they must not be indexed.
	        if isinstance(val, (str, bytes)):
	            values.append(val)
	            continue

	        try:
	            nelements = len(val)
	        except TypeError:
	            # scalar quantity has no len
	            values.append(val)
	            continue

	        if nelements != 1:
	            msg = "{0} must be a scalar quantity or an 1-element array.\n"\
	                  "Found = {1} instead".format(name, val)
	            raise ValueError(msg)

	        values.append(val[0])

	    return values


	def download_web_talks_for_year(year):
	    """Download the colloquia listing for ``year`` and return its talk entries.

	    Parameters
	    ----------
	    year : scalar or one-element sequence
	        Year to fetch; must lie within [2000, current year].

	    Returns
	    -------
	    The result of ``soup.findAll`` for every ``div`` element carrying the
	    ``talk`` class on the downloaded page (empty when there are none).

	    Raises
	    ------
	    ValueError
	        If ``year`` is outside [2000, current year].
	    RuntimeError
	        If the HTTP response status is not 200.
	    """
	    import datetime
	    import requests
	    try:
	        from bs4 import BeautifulSoup
	    except ImportError:
	        from BeautifulSoup import BeautifulSoup

	    # The helper takes a list of (value, name) tuples and hands back a
	    # list with one entry per tuple, so pull out the single year value
	    # before comparing it against integers below.
	    year = check_quantity_has_only_one_element([(year, "year")])[0]

	    current_year = datetime.datetime.now().year
	    min_year = 2000  # CAS only has records going back to 2000
	    if year > current_year or year < min_year:
	        msg = "Year = {0} should be within [{1}, {2}]".format(year, min_year, current_year)
	        raise ValueError(msg)

	    # Listings up to 2010 are separate per-year html pages; later years
	    # are served through a php query
	    if year <= 2010:
	        url = "http://astronomy.swin.edu.au/research/colloquia_{0}.html".format(year)
	    else:
	        url = "http://astronomy.swin.edu.au/research/colloquia.php?year={0}".format(year)

	    r = requests.get(url)
	    if r.status_code != 200:
	        msg = "Encountered error while fetching webpage {0}".format(url)
	        raise RuntimeError(msg)

	    soup = BeautifulSoup(r.content, "lxml")

	    # search for individual talk entries
	    talks = soup.findAll(["div"], {"class": ["talk"]})

	    return talks









	def parse_talks_for_year(year,
	                         requested_talk_type="Colloquium",
	                         text_type="abstract",
	                         save_image=False, **wc_kwargs):
	    """Build a word cloud from the talks listed for ``year``.

	    Parameters
	    ----------
	    year : scalar or one-element sequence
	        Year whose talk listing is downloaded.
	    requested_talk_type : str
	        Must be "Colloquium". For years from 2011 onwards a talk is kept
	        only when this string occurs (case-insensitively) in the talk's
	        ``title`` span; earlier years are not filtered by type.
	    text_type : str
	        Which section of each talk to collect: "abstract" or "title".
	    save_image : bool
	        When true, render the cloud with matplotlib and save it as
	        "<talk type>-<text type>-<year>-cat.png" (relative path).
	    **wc_kwargs
	        Forwarded unchanged to the ``WordCloud`` constructor.

	    Returns
	    -------
	    tuple or None
	        ``(wordcloud, all_text_content)``, or ``None`` when the listing
	        holds no talks or no usable text was collected.

	    Raises
	    ------
	    ValueError
	        If ``requested_talk_type`` or ``text_type`` is not supported.
	    """
	    from wordcloud import WordCloud
	    check_quantity_has_only_one_element([(requested_talk_type, "requested talk type"),
	                                         (text_type, "text section to parse"),
	                                         ])

	    valid_talk_types = ["Colloquium"]
	    if requested_talk_type not in valid_talk_types:
	        msg = "Please request talks of one of these types = {0}"\
	               .format(valid_talk_types)
	        raise ValueError(msg)

	    valid_text_types = ["abstract", "title"]
	    if text_type not in valid_text_types:
	        msg = "Please request the text from these following sections = {0}"\
	               .format(valid_text_types)
	        raise ValueError(msg)

	    # Accept a one-element sequence for the year as well, so that the
	    # integer comparison inside the loop below works on a plain value
	    year = check_quantity_has_only_one_element([(year, "year")])[0]

	    talks = download_web_talks_for_year(year)
	    ntalks = len(talks)
	    if ntalks == 0:
	        return None

	    text_chunks = []
	    talks_of_correct_type = 0
	    for talk in talks:
	        if year >= 2011:
	            talk_type = talk.find(["span"], {"class": ["title"]})
	            # find() gives None when the entry has no title span; such
	            # an entry cannot be matched against the requested type
	            if talk_type is None:
	                continue
	            if requested_talk_type.upper() not in talk_type.text.upper():
	                continue

	        talks_of_correct_type += 1

	        text_content = talk.find(["div", "span"], {"class": [text_type]})

	        # there was no abstract field for this talk
	        if text_content is None:
	            continue

	        # extract the text for the abstract
	        text_content = text_content.text

	        # short text in the content (abstract, talk title) are
	        # probably a sign that no relevant text was supplied/found
	        if len(text_content) <= 50:
	            continue

	        if "TBC" in text_content or "TBA" in text_content:
	            print("Found TBC or TBA in the {0} text field".format(text_type))
	            continue

	        text_chunks.append(text_content)

	    print("Found {0} {1} talks (out of a total of {2} talks) for year {3}".
	          format(talks_of_correct_type, requested_talk_type, ntalks, year))
	    if not text_chunks:
	        return None

	    # every chunk is introduced by a newline so that neighbouring talks
	    # never run into each other
	    all_text_content = "".join("\n" + chunk for chunk in text_chunks)

	    # replace galaxy with galaxies
	    all_text_content = all_text_content.replace("galaxy", "galaxies")
	    all_text_content = all_text_content.replace("Galaxy", "galaxies")
	    wordcloud = WordCloud(**wc_kwargs).generate(all_text_content)

	    if save_image:
	        import matplotlib.pyplot as plt
	        fig = plt.figure(figsize=(12, 12))
	        plt.imshow(wordcloud, interpolation='bilinear')
	        plt.axis("off")
	        fig.savefig("{0}-{1}-{2}-cat.png".format(requested_talk_type, text_type, year))
	        plt.close(fig)

	    return wordcloud, all_text_content

	# Years covered: 2000 through 2018 inclusive
	years = list(range(2000, 2019))

	from wordcloud import STOPWORDS

	# Stop words shipped with the wordcloud package, extended with extra
	# words to be excluded as well
	stopwords = set(STOPWORDS) | {
	    "will",
	    "results",
	    "observations",
	    "abstract",
	    "discuss",
	    "provide",
	    "use",
	    "show",
	    "based",
	    "us",
	    "using",
	    "used",
	    "now",
	    "within",
	    "Colloquium",
	    "still",
	}

	# NOTE(review): presumably the word-cloud canvas size in pixels -- the
	# point of use is not visible here, confirm against the caller
	width, height = 1600, 1200