David Rosson gartenfeld

## flat_to_mongo.py
import sys
import re
import codecs # UniCode support
from pymongo import Connection # For DB Connection
from pymongo.errors import ConnectionFailure # For catching exceptions

def main():

  # MongoDB connection
  try:

## star_files.py
def main():
  # Command-line parsing supports filename*.txt
  # Make a list of command line arguments, omitting the [0] element which is the script itself.
  args = sys.argv[1:]

  if not args:
    print 'Some message.'
    sys.exit(1)

  for filename in args:

## name-scraper.py
from bs4 import BeautifulSoup
import re # Regular Expressions
import collections # Data Types
import sys # File operations
import codecs # UniCode support

def scrape(page):
  """Read the UTF-8 file at `page` and parse its contents with BeautifulSoup."""
  # Dump raw HTML into Soup; the context manager closes the file handle
  # as soon as the contents have been read.
  with codecs.open(page, 'r', encoding='utf-8') as html_file:
    raw_data = html_file.read()
  soup = BeautifulSoup(raw_data)
  # NOTE(review): `soup` is neither returned nor used in this excerpt --
  # confirm against the full script before relying on a return value.

## exact_class.py
# Select tags named 'div' whose 'class' attribute equals exactly the list
# ['some-class'], i.e. that single class and no others.
# NOTE(review): presumably `soup` is a BeautifulSoup object, where calling it
# behaves like find_all() -- confirm against the surrounding script.
soup(lambda tag: tag.name == 'div' and tag.get('class') == ['some-class'])

## random_file.py
import os, random
# Pick one entry at random from the listing of a directory.
# os.listdir returns bare entry names (files and subdirectories alike), not
# full paths. "INSERT-DIR" is presumably a placeholder for the real directory.
random.choice(os.listdir("INSERT-DIR"))

## load_dir.py
# Non-recursive
import os
def load_directory(data_path):

	files_list = []
	try:
		for file_name in os.listdir(data_path):
			if file_name.endswith(".html"):
				files_list.append(file_name)

## unicode_wrapper.py
	file_header = "<html>\n<head>\n<meta charset='utf-8'>\n</head>\n<body>\n"
	file_footer = "\n</body>\n</html>"

## lcsub.py
def longest_common_substring(s1, s2):
  """Return the longest substring that occurs contiguously in both s1 and s2.

  Uses a dynamic-programming table in which m[x][y] is the length of the
  common run ending at s1[x - 1] and s2[y - 1]. When several substrings tie
  for the longest, the one that ends earliest in s1 is returned. An empty
  string is returned when the inputs share no character (including when
  either input is empty).
  """
  m = [[0] * (1 + len(s2)) for _ in range(1 + len(s1))]
  longest, x_up_to = 0, 0
  for x in range(1, 1 + len(s1)):
    for y in range(1, 1 + len(s2)): # match every char in s2 against every char in s1
      if s1[x - 1] == s2[y - 1]: # record a char match
        # Extend the run that ended on the previous pair of characters.
        m[x][y] = m[x - 1][y - 1] + 1
        if m[x][y] > longest:
          longest = m[x][y]
          x_up_to = x # index in s1 just past the end of the best run so far
  # Non-matching cells keep their initial 0, so no explicit reset is needed.
  return s1[x_up_to - longest:x_up_to]

## arpabet.txt
AA	ɑ
AA0	ɑ
AA1	ɑ
AA2	ɑ
AE	æ
AE0	æ
AE1	æ
AE2	æ
AH	ə
AH0	ə

## triage_fi_en.py
from bs4 import BeautifulSoup
import re # Regular Expressions
import collections # Data Types
import sys # File operations
import codecs # UniCode support
import os

def clear_output_file(out_file):
	file_header ="""<html>
<head>
	import sys
	import re
	import codecs # UniCode support
	from pymongo import Connection # For DB Connection
	from pymongo.errors import ConnectionFailure # For catching exeptions

	def main():

	# MongoDB connection
	try:
	def main():
	# Command-line parsing supports filename*.txt
	# Make a list of command line arguments, omitting the [0] element which is the script itself.
	args = sys.argv[1:]

	if not args:
	print 'Some message.'
	sys.exit(1)

	for filename in args:
	from bs4 import BeautifulSoup
	import re # Regular Expressions
	import collections # Data Types
	import sys # File operations
	import codecs # UniCode support

	def scrape(page):
	# Dump raw HTML into Soup
	raw_data = codecs.open(page, 'r', encoding='utf-8').read()
	soup = BeautifulSoup(raw_data)
	# Non-recursive
	import os
	def load_directory(data_path):

	files_list = []
	try:
	for file_name in os.listdir(data_path):
	if file_name.endswith(".html"):
	files_list.append(file_name)
	file_header = "<html>\n<head>\n<meta charset='utf-8'>\n</head>\n<body>\n"
	file_footer = "\n</body>\n</html>"
	def longest_common_substring(s1, s2):
	m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))]
	longest, x_up_to = 0, 0
	for x in range(1, 1 + len(s1)):
	for y in range(1, 1 + len(s2)): # match every char in s2 against every char in s1
	if s1[x - 1] == s2[y - 1]: # record a char match
	m[x][y] = m[x - 1][y - 1] + 1 # char match tally will accumulate if previous char also matched
	if m[x][y] > longest:
	longest = m[x][y]
	x_up_to = x # record char position of last found match