Skip to content

Instantly share code, notes, and snippets.

@bitmingw
Last active May 27, 2016 03:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bitmingw/0607956447ed0dfc6c3bcdaf42fff550 to your computer and use it in GitHub Desktop.
Save bitmingw/0607956447ed0dfc6c3bcdaf42fff550 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Web crawler and analysis of matrix67's ideagen
Author: Ming Wen
"""
import random
from collections import Counter, defaultdict
from urllib.request import urlopen

from bs4 import BeautifulSoup
import pylab
def crawler(log_file_name, request_number):
    """
    Gather data by making HTTP requests, then append results to a file.

    Requests are made until the file holds `request_number` lines, so the
    function is resumable: records saved by a previous run count toward
    the total.

    log_file_name: str, file name to save data.
    request_number: int, total number of desired records.
    Return: None.
    """
    IDEAGEN_URL = "http://www.matrix67.com/ideagen/"

    # Count records already saved; a missing file simply means zero.
    # (Mode "a" below creates the file if needed, so there is no need
    # to create it explicitly here, and counting lazily avoids loading
    # the whole log into memory just to count lines.)
    try:
        with open(log_file_name, "r", encoding="utf-8") as f:
            current_number_req = sum(1 for _ in f)
    except FileNotFoundError:
        current_number_req = 0

    # Append new records until the desired total is reached.
    with open(log_file_name, "a", encoding="utf-8") as f:
        while current_number_req < request_number:
            # Each request returns a freshly generated phrase; the
            # context manager closes the HTTP response (the original
            # leaked it).
            with urlopen(IDEAGEN_URL) as req:
                content_str = req.read().decode()
            # The phrase lives in the first <div><p> of the page.
            soup = BeautifulSoup(content_str, "html.parser")
            word = soup.div.p.string.strip() + "\n"
            # end="" because `word` already carries the newline; the
            # original printed a stray blank line per record.
            print(current_number_req + 1, word, end="")
            f.write(word)
            current_number_req += 1
def split_word(log_file_name):
    """
    Read data from a file, split each record on the first "的".

    Each line is stripped of surrounding whitespace (including the
    trailing newline) before splitting, so identical records map to the
    same key regardless of whether the line ends with a newline — the
    original kept the "\\n" on every noun, making the file's last record
    count separately from its duplicates.

    log_file_name: str, data file name.
    Return:
    - adjectives: list of str, text before the first "的" (exclusive).
    - nouns: list of str, text after the first "的" (exclusive).
      A line without the delimiter yields the whole line as the
      adjective and an empty noun (the original sliced garbage in
      that case because find() returned -1).
    """
    adjectives = []
    nouns = []
    with open(log_file_name, "r", encoding="utf-8") as f:
        for line in f:
            # There may be several "的" in a record; the first one is
            # taken as the adjective/noun boundary.
            adj, _, noun = line.strip().partition("的")
            adjectives.append(adj)
            nouns.append(noun)
    return adjectives, nouns
def list_count(li):
    """
    Count the number of repeated elements in a list.

    li: iterable of hashable elements.
    Return: dict (collections.Counter), key = element, value = count.
    Missing keys read as 0, matching the defaultdict(int) behaviour of
    the previous hand-rolled implementation.
    """
    # Counter is the stdlib's purpose-built frequency counter; it is a
    # dict subclass, so existing callers (len, .keys(), .values())
    # work unchanged.
    return Counter(li)
def simulate_count(class_number, sample_number):
    """
    Simulate drawing samples uniformly at random, with replacement.

    class_number: int, how many distinct classes exist.
    sample_number: int, how many samples to draw.
    Return: dict, key = class index, value = times it was drawn.
    """
    # A fixed seed keeps every run of the simulation reproducible.
    random.seed("matrix67")
    draws = []
    for _ in range(sample_number):
        draws.append(random.randrange(0, class_number))
    return list_count(draws)
def plot_hist(count_dict, bins, title_str):
    """
    Draw a histogram of how often each word repeats.

    count_dict: dict, key = word, value = number of repeats.
    bins: int, the number of bins in the histogram.
    title_str: str, the title of the figure.
    """
    # Only the repeat counts matter for the histogram, not the words.
    repeat_counts = list(count_dict.values())
    pylab.hist(repeat_counts, bins=bins)
    pylab.title(title_str)
    pylab.xlabel("Number of Repeat")
    pylab.ylabel("Frequency")
    # Blocks until the figure window is closed.
    pylab.show()
if __name__ == "__main__":
    # Number of phrases to collect (and to draw in the simulation).
    TOTAL_REQUESTS = 1000000

    # Step 1: collect raw phrases from the website into a local log.
    crawler("record.txt", TOTAL_REQUESTS)

    # Step 2: split every phrase into its adjective and noun parts.
    adjectives, nouns = split_word("record.txt")

    # Step 3: count how often each distinct word shows up.
    adj_dict = list_count(adjectives)
    noun_dict = list_count(nouns)
    adj_dict_num = len(adj_dict)
    noun_dict_num = len(noun_dict)
    print("Total number of adjectives", adj_dict_num)
    print("Total number of nouns", noun_dict_num)

    # Step 4: simulate uniform sampling over vocabularies of the same
    # sizes, for comparison against the real distributions.
    simulate_adj_dict = simulate_count(adj_dict_num, TOTAL_REQUESTS)
    simulate_noun_dict = simulate_count(noun_dict_num, TOTAL_REQUESTS)

    # Step 5: plot real vs. simulated repeat-count histograms.
    plot_hist(adj_dict, 30, "Adjectives Distribution")
    plot_hist(noun_dict, 30, "Nouns Distribution")
    plot_hist(simulate_adj_dict, 30, "Simulated Adjectives Distribution")
    plot_hist(simulate_noun_dict, 30, "Simulated Nouns Distribution")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment