import os
import textwrap
from bs4 import BeautifulSoup


# Writes a block of text to the file outf.
# Lines are wrapped at 70 characters (plus a leading tab).
# Special characters are reinterpreted to appear closer to how they
# appear in the browser.
# It is not perfect yet but it's the closest I've gotten so far.
def print_block(block, outf):
    nl_count = 0
    for line in block.strip().replace('\xa0', '\n\n').split('\n'):
        if len(line) > 70:
            wlines = textwrap.wrap(line, 70)
            for wline in wlines:
                outf.write('\t' + wline + '\n')
        else:
            if line.strip() == '':
                if nl_count < 2:
                    outf.write('\n')
                    nl_count = nl_count + 1
            else:
                outf.write('\t' + line + '\n')
                nl_count = 0
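
# A minimal usage sketch (hypothetical text and output file), assuming the
# tab-indented, 70-column wrapping above. Non-breaking spaces ('\xa0')
# become paragraph breaks:
#
#   with open("example.txt", "w") as outf:
#       print_block("First paragraph.\xa0Second paragraph, long enough "
#                   "that textwrap folds it onto several indented lines.",
#                   outf)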


# Recursively prints out the quoted posts in the message.
# The oldest quoted post is the innermost blockquote; the newest is the
# outermost. The recursion bottoms out at the innermost quote, so the
# quotes are printed oldest first, in the order they were written.
# The initial call can discard the return value.
def print_quote(msg, outf):
    if msg.blockquote is None:
        return ''
    txt = msg.blockquote.text.replace('wrote:', 'wrote: ')
    print_block(txt.replace(print_quote(msg.blockquote, outf), ''), outf)
    outf.write('\n')
    return txt
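
# An illustrative sketch (hypothetical markup) of the nesting print_quote
# walks. Each level's own text is recovered by stripping the inner quote's
# text from the enclosing blockquote's text before printing it:
#
#   <blockquote> C wrote: ...      <- newest quote, printed last
#     <blockquote> B wrote: ...    <- printed second
#       <blockquote> A wrote: ...  <- oldest quote, printed first
#       </blockquote>
#     </blockquote>
#   </blockquote>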


def print_html(filename, outf):
    with open(filename, "r", encoding='utf-8') as f:
        text = f.read()

    # Breaking apart the posts of the page using Beautiful Soup.
    # Specific parsing is done in the loop.
    soup = BeautifulSoup(text, "html.parser")

    usersoups = []
    for link in soup.find_all("div", class_="lia-message-author-username"):
        usersoups.append(BeautifulSoup(str(link), "html.parser"))

    msgsoups = []
    for link in soup.find_all("div", class_="lia-message-body-content"):
        msgsoups.append(BeautifulSoup(str(link), "html.parser"))

    datesoups = []
    for link in soup.find_all("span", class_="lia-message-posted-on"):
        datesoups.append(BeautifulSoup(str(link), "html.parser"))

    # Loop to format the text of the page.
    # This can be modified to produce a different output format.
    for user, msg, date in zip(usersoups, msgsoups, datesoups):
        # GETTING DATA

        # Username and date of the post.
        username = user.get_text().strip()
        date = date.get_text().strip()

        # The body text of the post, minus signature or quotes.
        text = msg.get_text()

        # User signature section of the post.
        sig = ''

        # The blockquote tag heading the post's quotes.
        # See print_quote for more.
        quotes = msg.blockquote

        # Remove the signature and quotes from the main text,
        # so they can be handled independently.
        if msg.div.div is not None:
            sig = msg.div.div.text
            text = msg.get_text().replace(sig, '')
        if quotes is not None:
            if quotes.text.strip() != "":
                text = text.replace(quotes.text, '')

        # WRITING TO FILE

        # Write username and date.
        outf.write(username + '\n')
        outf.write(date + '\n\n')

        # If there are quoted posts, write those in order.
        # See print_quote for more.
        if quotes is not None:
            if quotes.text.strip() != "":
                outf.write("QUOTES:\n")
                print_quote(msg, outf)

        # Write the body text.
        outf.write("BODY:\n")
        print_block(text, outf)

        outf.write('\n\n\n')
        outf.write('_' * 80)
        outf.write("\n\n\n\n")
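
# Sketch of the per-post layout that print_html writes (reconstructed from
# the writes above; the indented parts come from print_block's tab indent):
#
#   <username>
#   <date>
#
#   QUOTES:            (only when the post quotes earlier posts)
#       ...quoted posts, oldest first...
#   BODY:
#       ...wrapped body text...
#
#   ________________   (a line of 80 underscores separates posts)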


def main():
    # Compiles the entire thread into one mega text file.
    # HTML files are in a folder called "scraped".
    # One HTML file per thread page.
    with open("thread.txt", "w") as f:
        print("Parsing HTML...")
        for filename in sorted(os.listdir("scraped")):
            print(filename)
            print_html("scraped/" + filename, f)

    # Read the contents of the newly created mega file for parsing.
    with open("thread.txt", "r") as f:
        text = f.read()

    # Posts are separated by a long line in between.
    # There's a line at the end, so ignore the blank post
    # generated by the split function.
    posts = text.split("_" * 80)
    posts.pop(len(posts) - 1)

    # Extracts the year. The thread will be broken up into
    # separate files per year (2013-2017).
    year = posts[0].split('\n')[1].split('-')[2]
    f = open("output/greg_" + str(year) + ".txt", "w")

    print("\nParsing complete. Writing output files...")
    print("output/greg_" + str(year) + ".txt")

    for post in posts:
        post_date = post.strip().split('\n')[1]

        # Some posts do not have a proper date ("yesterday", "last week").
        # Ignore those and consider them as part of the current year.
        if len(post_date.split('-')) < 3:
            year_next = year
        else:
            year_next = post_date.split('-')[2]

        # Start a new output file whenever the year changes.
        if year_next != year:
            year = year_next
            f.close()
            f = open("output/greg_" + str(year) + ".txt", "w")
            print("output/greg_" + str(year) + ".txt")

        # Write the post and re-insert the separating line.
        f.write(post + "_" * 80)

    f.close()
    print("\nAll done!")


if __name__ == "__main__":
    main()


# Calls wget in a shell to get a specified number of pages.
import os
import requests
import json

baseurl = "https://community.lego.com/t5/LEGO-General/Chat-with-Greg-Farshtey/td-p/6605180/page"


def get_page(p):
    filename = "scraped/%03d" % p
    url = "%s/%d" % (baseurl, p)
    os.system("wget \"%s\" -O %s" % (url, filename))


for page in range(1, 925):
    get_page(page)
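

# A minimal alternative sketch using the requests library imported above
# (the original shells out to wget instead; this function is not called).
# It assumes the same "scraped" directory and zero-padded filenames.
def get_page_requests(p):
    url = "%s/%d" % (baseurl, p)
    r = requests.get(url)
    r.raise_for_status()
    with open("scraped/%03d" % p, "w", encoding="utf-8") as f:
        f.write(r.text)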