Created
April 15, 2017 03:59
-
-
Save kueller/d7ce15ea130f28e58f397dadd2fa3675 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import textwrap | |
from bs4 import BeautifulSoup | |
# Emit one block of post text into outf, wrapped for plain-text reading.
def print_block(block, outf):
    """Write *block* to outf, tab-indented and wrapped to 70 columns.

    Non-breaking spaces are treated as paragraph breaks, and runs of
    blank lines are capped at two so quoted HTML doesn't produce huge
    vertical gaps.  Output is close to, but not identical to, the
    browser rendering.
    """
    blank_streak = 0
    normalized = block.strip().replace('\xa0', '\n\n')
    for raw in normalized.split('\n'):
        if len(raw) > 70:
            # Over-long line: re-wrap it at 70 columns, one tab per piece.
            for piece in textwrap.wrap(raw, 70):
                outf.write('\t' + piece + '\n')
        elif not raw.strip():
            # Blank line: emit at most two in a row.
            if blank_streak < 2:
                outf.write('\n')
            blank_streak += 1
        else:
            outf.write('\t' + raw + '\n')
            blank_streak = 0
# Recursively prints the quoted posts contained in a message.
# The oldest quote is the innermost <blockquote>, so the recursion
# bottoms out there and text is emitted oldest-first on the way back up.
def print_quote(msg, outf):
    """Write msg's quote chain to outf, oldest quote first.

    Returns the full text of msg's top-level blockquote (empty string
    when there is none) so each caller can strip the already-printed
    inner quote from its own text.  The initial caller can discard the
    return value.
    """
    quote = msg.blockquote
    if quote is None:
        return ''
    txt = quote.text.replace('wrote:', 'wrote: ')
    # Recurse first: inner (older) quotes are written before this level.
    inner = print_quote(quote, outf)
    print_block(txt.replace(inner, ''), outf)
    outf.write('\n')
    return txt
def print_html(filename, outf):
    """Parse one saved HTML page of the thread and append its posts to outf.

    For each post it writes username, date, any quoted posts (via
    print_quote) and the body text (via print_block), followed by an
    80-underscore separator line.
    """
    with open(filename, "r", encoding='utf-8') as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    def _subsoups(tag, cls):
        # Re-parse each matched element as its own soup object,
        # mirroring the per-element BeautifulSoup(str(...)) round trip.
        return [BeautifulSoup(str(el), "html.parser")
                for el in soup.find_all(tag, class_=cls)]

    usersoups = _subsoups("div", "lia-message-author-username")
    msgsoups = _subsoups("div", "lia-message-body-content")
    datesoups = _subsoups("span", "lia-message-posted-on")

    # Format each post; tweak this loop to change the output layout.
    for user, msg, date in zip(usersoups, msgsoups, datesoups):
        # --- gather the pieces of this post ---
        username = user.get_text().strip()
        posted_on = date.get_text().strip()
        body = msg.get_text()
        # Quote chain head; see print_quote for how it is traversed.
        quotes = msg.blockquote

        # Remove the signature (first nested div) from the body text
        # so it is not duplicated in the output.
        if msg.div.div is not None:
            body = msg.get_text().replace(msg.div.div.text, '')
        # Remove the quoted text too; it is printed separately.
        if quotes is not None and quotes.text.strip() != "":
            body = body.replace(quotes.text, '')

        # --- write the post ---
        outf.write(username + '\n')
        outf.write(posted_on + '\n\n')
        if quotes is not None and quotes.text.strip() != "":
            outf.write("QUOTES:\n")
            print_quote(msg, outf)
        outf.write("BODY:\n")
        print_block(body, outf)
        outf.write('\n\n\n')
        outf.write('_' * 80)
        outf.write("\n\n\n\n")
def main():
    """Stitch every scraped HTML page into thread.txt, then split the
    posts into one output file per year (2013-2017)."""
    # Phase 1: concatenate every page in scraped/ into one mega file.
    # One HTML file per thread page.
    with open("thread.txt", "w") as outf:
        print("Parsing HTML...")
        for name in sorted(os.listdir("scraped")):
            print(name)
            print_html("scraped/" + name, outf)

    # Phase 2: re-read the mega file for splitting.
    with open("thread.txt", "r") as inf:
        contents = inf.read()

    # Posts are separated by an 80-underscore rule.  The file also ends
    # with one, so split() leaves a trailing empty entry — drop it.
    posts = contents.split("_" * 80)
    posts.pop()

    # Dates look like MM-DD-YYYY (second line of a post); the year
    # selects which output file a post goes into.
    year = posts[0].split('\n')[1].split('-')[2]
    out = open("output/greg_" + str(year) + ".txt", "w")
    print("\nParsing complete. Writing output files...")
    print("output/greg_" + str(year) + ".txt")
    for post in posts:
        stamp = post.strip().split('\n')[1]
        # Relative dates ("yesterday", "last week") have no year field;
        # treat those posts as belonging to the current year.
        fields = stamp.split('-')
        year_next = fields[2] if len(fields) >= 3 else year
        if year_next != year:
            # Year rolled over: switch to a fresh output file.
            year = year_next
            out.close()
            out = open("output/greg_" + str(year) + ".txt", "w")
            print("output/greg_" + str(year) + ".txt")
        # Re-insert the separating rule the split removed.
        out.write(post + "_" * 80)
    out.close()
    print("\nAll done!")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Calls wget in a shell to get a specified number of pages. | |
import json
import os
import subprocess

import requests
baseurl = "https://community.lego.com/t5/LEGO-General/Chat-with-Greg-Farshtey/td-p/6605180/page" | |
def get_page(p):
    """Download page *p* of the thread into scraped/NNN using wget.

    Page numbers are zero-padded to three digits so a lexicographic
    listing of scraped/ matches page order.
    """
    filename = "scraped/%03d" % p
    url = "%s/%d" % (baseurl, p)
    # Pass wget an argument list instead of interpolating into a shell
    # string (the old os.system call) — no quoting/injection pitfalls.
    subprocess.run(["wget", url, "-O", filename])
# Fetch every page of the thread; the thread had 924 pages when this
# was written, so pages 1..924 inclusive.
for page in range(1,925):
    get_page(page)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment