Skip to content

Instantly share code, notes, and snippets.

@bitmingw
Last active May 27, 2016 03:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bitmingw/0607956447ed0dfc6c3bcdaf42fff550 to your computer and use it in GitHub Desktop.
Save bitmingw/0607956447ed0dfc6c3bcdaf42fff550 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Web crawler and analysis of matrix67's ideagen
Author: Ming Wen
"""
import random
from collections import Counter, defaultdict
from urllib.request import urlopen

from bs4 import BeautifulSoup
import pylab
def crawler(log_file_name, request_number):
    """
    Gather data by making HTTP requests, then append results to a file.

    Requests are made until the file holds `request_number` lines, so the
    function is resumable: records saved by a previous run count toward
    the total.

    log_file_name: str, file name to save data.
    request_number: int, total number of desired records.
    Return: None.
    """
    IDEAGEN_URL = "http://www.matrix67.com/ideagen/"

    # Count records already saved; a missing file simply means zero.
    # (Mode "a" below creates the file if needed, so there is no need
    # to create it explicitly here, and counting lazily avoids loading
    # the whole log into memory just to count lines.)
    try:
        with open(log_file_name, "r", encoding="utf-8") as f:
            current_number_req = sum(1 for _ in f)
    except FileNotFoundError:
        current_number_req = 0

    # Append new records until the desired total is reached.
    with open(log_file_name, "a", encoding="utf-8") as f:
        while current_number_req < request_number:
            # Each request returns a freshly generated phrase; the
            # context manager closes the HTTP response (the original
            # leaked it).
            with urlopen(IDEAGEN_URL) as req:
                content_str = req.read().decode()
            # The phrase lives in the first <div><p> of the page.
            soup = BeautifulSoup(content_str, "html.parser")
            word = soup.div.p.string.strip() + "\n"
            # end="" because `word` already carries the newline; the
            # original printed a stray blank line per record.
            print(current_number_req + 1, word, end="")
            f.write(word)
            current_number_req += 1
def split_word(log_file_name):
    """
    Read data from a file, split each record on the first "的".

    Each line is stripped of surrounding whitespace (including the
    trailing newline) before splitting, so identical records map to the
    same key regardless of whether the line ends with a newline — the
    original kept the "\\n" on every noun, making the file's last record
    count separately from its duplicates.

    log_file_name: str, data file name.
    Return:
    - adjectives: list of str, text before the first "的" (exclusive).
    - nouns: list of str, text after the first "的" (exclusive).
      A line without the delimiter yields the whole line as the
      adjective and an empty noun (the original sliced garbage in
      that case because find() returned -1).
    """
    adjectives = []
    nouns = []
    with open(log_file_name, "r", encoding="utf-8") as f:
        for line in f:
            # There may be several "的" in a record; the first one is
            # taken as the adjective/noun boundary.
            adj, _, noun = line.strip().partition("的")
            adjectives.append(adj)
            nouns.append(noun)
    return adjectives, nouns
def list_count(li):
    """
    Count the number of repeated elements in a list.

    li: iterable of hashable elements.
    Return: dict (collections.Counter), key = element, value = count.
    Missing keys read as 0, matching the defaultdict(int) behaviour of
    the previous hand-rolled implementation.
    """
    # Counter is the stdlib's purpose-built frequency counter; it is a
    # dict subclass, so existing callers (len, .keys(), .values())
    # work unchanged.
    return Counter(li)
def simulate_count(class_number, sample_number):
    """
    Simulate drawing samples uniformly at random, with replacement.

    class_number: int, how many distinct classes exist.
    sample_number: int, how many samples to draw.
    Return: dict, key = class index, value = times it was drawn.
    """
    # A fixed seed keeps every run of the simulation reproducible.
    random.seed("matrix67")
    draws = []
    for _ in range(sample_number):
        draws.append(random.randrange(0, class_number))
    return list_count(draws)
def plot_hist(count_dict, bins, title_str):
    """
    Draw a histogram of how often each word repeats.

    count_dict: dict, key = word, value = number of repeats.
    bins: int, the number of bins in the histogram.
    title_str: str, the title of the figure.
    """
    # Only the repeat counts matter for the histogram, not the words.
    repeat_counts = list(count_dict.values())
    pylab.hist(repeat_counts, bins=bins)
    pylab.title(title_str)
    pylab.xlabel("Number of Repeat")
    pylab.ylabel("Frequency")
    # Blocks until the figure window is closed.
    pylab.show()
if __name__ == "__main__":
    # Number of phrases to collect (and to draw in the simulation).
    TOTAL_REQUESTS = 1000000

    # Step 1: collect raw phrases from the website into a local log.
    crawler("record.txt", TOTAL_REQUESTS)

    # Step 2: split every phrase into its adjective and noun parts.
    adjectives, nouns = split_word("record.txt")

    # Step 3: count how often each distinct word shows up.
    adj_dict = list_count(adjectives)
    noun_dict = list_count(nouns)
    adj_dict_num = len(adj_dict)
    noun_dict_num = len(noun_dict)
    print("Total number of adjectives", adj_dict_num)
    print("Total number of nouns", noun_dict_num)

    # Step 4: simulate uniform sampling over vocabularies of the same
    # sizes, for comparison against the real distributions.
    simulate_adj_dict = simulate_count(adj_dict_num, TOTAL_REQUESTS)
    simulate_noun_dict = simulate_count(noun_dict_num, TOTAL_REQUESTS)

    # Step 5: plot real vs. simulated repeat-count histograms.
    plot_hist(adj_dict, 30, "Adjectives Distribution")
    plot_hist(noun_dict, 30, "Nouns Distribution")
    plot_hist(simulate_adj_dict, 30, "Simulated Adjectives Distribution")
    plot_hist(simulate_noun_dict, 30, "Simulated Nouns Distribution")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment