Skip to content

Instantly share code, notes, and snippets.

View gartenfeld's full-sized avatar

David Rosson gartenfeld

View GitHub Profile
@gartenfeld
gartenfeld / flat_to_mongo.py
Last active April 17, 2020 05:35
Import data from a flat file into MongoDB.
import sys
import re
import codecs # Unicode support
from pymongo import Connection # For DB Connection
from pymongo.errors import ConnectionFailure # For catching exceptions
def main():
# MongoDB connection
try:
@gartenfeld
gartenfeld / star_files.py
Last active December 12, 2015 02:39
Processing multiple files.
def main():
# Command-line parsing supports filename*.txt
# Make a list of command line arguments, omitting the [0] element which is the script itself.
args = sys.argv[1:]
if not args:
print 'Some message.'
sys.exit(1)
for filename in args:
@gartenfeld
gartenfeld / name-scraper.py
Created February 23, 2013 05:50
Scraping names of participants in a class from a raw HTML dump.
from bs4 import BeautifulSoup
import re # Regular Expressions
import collections # Data Types
import sys # File operations
import codecs # Unicode support
def scrape(page):
# Dump raw HTML into Soup
raw_data = codecs.open(page, 'r', encoding='utf-8').read()
soup = BeautifulSoup(raw_data)
@gartenfeld
gartenfeld / exact_class.py
Created July 10, 2014 06:07
Matching exact class attribute.
# Select only <div> tags whose class attribute is exactly "some-class".
# The filter compares the whole attribute value against a one-element list,
# so a tag carrying any additional class does not match.
# NOTE(review): assumes `soup` is a BeautifulSoup object (calling it applies
# the filter like find_all) and that `class` comes back as a list — confirm.
soup(
    lambda element: element.name == 'div'
    and element.get('class') == ['some-class']
)
@gartenfeld
gartenfeld / random_file.py
Created July 10, 2014 06:11
Choose random file in directory.
import os
import random

# Pick one entry at random from the directory; replace the "INSERT-DIR"
# placeholder with a real path before running.
# os.listdir returns every entry name, so the pick may be a subdirectory
# as well as a regular file.
random.choice(os.listdir("INSERT-DIR"))
@gartenfeld
gartenfeld / load_dir.py
Created July 12, 2014 02:30
Load directory.
# Non-recursive
import os
def load_directory(data_path):
files_list = []
try:
for file_name in os.listdir(data_path):
if file_name.endswith(".html"):
files_list.append(file_name)
@gartenfeld
gartenfeld / unicode_wrapper.py
Last active August 29, 2015 14:04
Unicode HTML wrapper
# Opening boilerplate for a generated HTML page; the meta tag declares the
# document as UTF-8.
file_header = (
    "<html>\n"
    "<head>\n"
    "<meta charset='utf-8'>\n"
    "</head>\n"
    "<body>\n"
)

# Closing boilerplate that matches file_header.
file_footer = (
    "\n</body>"
    "\n</html>"
)
@gartenfeld
gartenfeld / lcsub.py
Last active August 29, 2015 14:08
Find longest common substring.
def longest_common_substring(s1, s2):
m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))]
longest, x_up_to = 0, 0
for x in range(1, 1 + len(s1)):
for y in range(1, 1 + len(s2)): # match every char in s2 against every char in s1
if s1[x - 1] == s2[y - 1]: # record a char match
m[x][y] = m[x - 1][y - 1] + 1 # char match tally will accumulate if previous char also matched
if m[x][y] > longest:
longest = m[x][y]
x_up_to = x # record char position of last found match
@gartenfeld
gartenfeld / arpabet.txt
Last active July 26, 2022 11:44
Frequency distribution of syllables using CMU dictionary and COCA.
AA ɑ
AA0 ɑ
AA1 ɑ
AA2 ɑ
AE æ
AE0 æ
AE1 æ
AE2 æ
AH ə
AH0 ə
@gartenfeld
gartenfeld / triage_fi_en.py
Created November 1, 2014 11:11
Cleaning up and separating lexical data files.
from bs4 import BeautifulSoup
import re # Regular Expressions
import collections # Data Types
import sys # File operations
import codecs # Unicode support
import os
def clear_output_file(out_file):
file_header ="""<html>
<head>