Created
April 15, 2017 03:59
-
-
Save kueller/d7ce15ea130f28e58f397dadd2fa3675 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import textwrap | |
from bs4 import BeautifulSoup | |
# Emit one block of post text into outf, wrapped for plain-text reading.
def print_block(block, outf):
    """Write *block* to outf, tab-indented and wrapped to 70 columns.

    Non-breaking spaces are treated as paragraph breaks, and runs of
    blank lines are capped at two so quoted HTML doesn't produce huge
    vertical gaps.  Output is close to, but not identical to, the
    browser rendering.
    """
    blank_streak = 0
    normalized = block.strip().replace('\xa0', '\n\n')
    for raw in normalized.split('\n'):
        if len(raw) > 70:
            # Over-long line: re-wrap it at 70 columns, one tab per piece.
            for piece in textwrap.wrap(raw, 70):
                outf.write('\t' + piece + '\n')
        elif not raw.strip():
            # Blank line: emit at most two in a row.
            if blank_streak < 2:
                outf.write('\n')
            blank_streak += 1
        else:
            outf.write('\t' + raw + '\n')
            blank_streak = 0
# Recursively prints the quoted posts contained in a message.
# The oldest quote is the innermost <blockquote>, so the recursion
# bottoms out there and text is emitted oldest-first on the way back up.
def print_quote(msg, outf):
    """Write msg's quote chain to outf, oldest quote first.

    Returns the full text of msg's top-level blockquote (empty string
    when there is none) so each caller can strip the already-printed
    inner quote from its own text.  The initial caller can discard the
    return value.
    """
    quote = msg.blockquote
    if quote is None:
        return ''
    txt = quote.text.replace('wrote:', 'wrote: ')
    # Recurse first: inner (older) quotes are written before this level.
    inner = print_quote(quote, outf)
    print_block(txt.replace(inner, ''), outf)
    outf.write('\n')
    return txt
def print_html(filename, outf):
    """Parse one saved HTML page of the thread and append its posts to outf.

    For each post it writes username, date, any quoted posts (via
    print_quote) and the body text (via print_block), followed by an
    80-underscore separator line.
    """
    with open(filename, "r", encoding='utf-8') as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    def _subsoups(tag, cls):
        # Re-parse each matched element as its own soup object,
        # mirroring the per-element BeautifulSoup(str(...)) round trip.
        return [BeautifulSoup(str(el), "html.parser")
                for el in soup.find_all(tag, class_=cls)]

    usersoups = _subsoups("div", "lia-message-author-username")
    msgsoups = _subsoups("div", "lia-message-body-content")
    datesoups = _subsoups("span", "lia-message-posted-on")

    # Format each post; tweak this loop to change the output layout.
    for user, msg, date in zip(usersoups, msgsoups, datesoups):
        # --- gather the pieces of this post ---
        username = user.get_text().strip()
        posted_on = date.get_text().strip()
        body = msg.get_text()
        # Quote chain head; see print_quote for how it is traversed.
        quotes = msg.blockquote

        # Remove the signature (first nested div) from the body text
        # so it is not duplicated in the output.
        if msg.div.div is not None:
            body = msg.get_text().replace(msg.div.div.text, '')
        # Remove the quoted text too; it is printed separately.
        if quotes is not None and quotes.text.strip() != "":
            body = body.replace(quotes.text, '')

        # --- write the post ---
        outf.write(username + '\n')
        outf.write(posted_on + '\n\n')
        if quotes is not None and quotes.text.strip() != "":
            outf.write("QUOTES:\n")
            print_quote(msg, outf)
        outf.write("BODY:\n")
        print_block(body, outf)
        outf.write('\n\n\n')
        outf.write('_' * 80)
        outf.write("\n\n\n\n")
def main():
    """Stitch every scraped HTML page into thread.txt, then split the
    posts into one output file per year (2013-2017)."""
    # Phase 1: concatenate every page in scraped/ into one mega file.
    # One HTML file per thread page.
    with open("thread.txt", "w") as outf:
        print("Parsing HTML...")
        for name in sorted(os.listdir("scraped")):
            print(name)
            print_html("scraped/" + name, outf)

    # Phase 2: re-read the mega file for splitting.
    with open("thread.txt", "r") as inf:
        contents = inf.read()

    # Posts are separated by an 80-underscore rule.  The file also ends
    # with one, so split() leaves a trailing empty entry — drop it.
    posts = contents.split("_" * 80)
    posts.pop()

    # Dates look like MM-DD-YYYY (second line of a post); the year
    # selects which output file a post goes into.
    year = posts[0].split('\n')[1].split('-')[2]
    out = open("output/greg_" + str(year) + ".txt", "w")
    print("\nParsing complete. Writing output files...")
    print("output/greg_" + str(year) + ".txt")
    for post in posts:
        stamp = post.strip().split('\n')[1]
        # Relative dates ("yesterday", "last week") have no year field;
        # treat those posts as belonging to the current year.
        fields = stamp.split('-')
        year_next = fields[2] if len(fields) >= 3 else year
        if year_next != year:
            # Year rolled over: switch to a fresh output file.
            year = year_next
            out.close()
            out = open("output/greg_" + str(year) + ".txt", "w")
            print("output/greg_" + str(year) + ".txt")
        # Re-insert the separating rule the split removed.
        out.write(post + "_" * 80)
    out.close()
    print("\nAll done!")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Calls wget in a shell to get a specified number of pages. | |
import json
import os
import subprocess

import requests
baseurl = "https://community.lego.com/t5/LEGO-General/Chat-with-Greg-Farshtey/td-p/6605180/page" | |
def get_page(p):
    """Download page *p* of the thread into scraped/NNN using wget.

    Page numbers are zero-padded to three digits so a lexicographic
    listing of scraped/ matches page order.
    """
    filename = "scraped/%03d" % p
    url = "%s/%d" % (baseurl, p)
    # Pass wget an argument list instead of interpolating into a shell
    # string (the old os.system call) — no quoting/injection pitfalls.
    subprocess.run(["wget", url, "-O", filename])
# Fetch every page of the thread; the thread had 924 pages when this
# was written, so pages 1..924 inclusive.
for page in range(1,925):
    get_page(page)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment