Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@benosteen
benosteen / altototxt.py
Created March 3, 2017 16:37
Quick and dirty ALTO XML to txt
#python3
import re, os
# Matches a CONTENT="..." attribute and captures the text between the double quotes.
text_p = re.compile(r"CONTENT=\"([^\"]*)\"", re.U)
# Matches a closing </TextLine> tag.
line_p = re.compile(r"</TextLine>", re.U)
def get_text(alto_filepath):
current = ""
text_content = ""
words = []
# NB this will flatten the data. Some fields (author, pdf, imgs) have extra data that will be lost if you include them in this data.
# the "author" field has a variety of nuances that will be lost for example (creator, editor, etc)
# Intended for use with https://dx.doi.org/10.21250/DB21
# MIT Licence 2016
import json, csv
EXPORTFILENAME = "book_data.csv"
FIELDS = ['datefield', 'shelfmarks', 'title', 'publisher', 'edition', 'flickr_url_to_book_images', 'place', 'issuance',
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@benosteen
benosteen / gist:822fef58c6c92b120fff
Created January 21, 2015 14:44
Pulling metadata together from id3 tag XML files
import os
from xml.etree import ElementTree as ET
import csv
# Placeholder path; presumably the CSV file the script writes — confirm against usage.
OUTPUT = "/path/to/output.csv"
# Placeholder path; presumably the root folder scanned for XML files — confirm against usage.
PATH = "/path/to/root/folder/that/has/all/the/xmls"
# Column names.
# NOTE(review): 'id3v2' is listed twice (8th and last entries) — confirm the duplicate is intentional.
HEADERS = ['ALBUM', 'TITLE', 'ARTIST', 'GENRE', 'TRACKNUMBER', 'COMMENTS', 'YEAR', 'id3v2', 'PATH', 'FROM_FILENAME', 'id3v2']
# Assuming UTF-8...
import csv
import json
# Name of the CSV file to read.
INPUTFILE = "History_Journal_Articles_KW.csv"
# Output name: the input name minus its last four characters (".csv" here) plus "_numbered.csv".
OUTPUTFILE = INPUTFILE[:-4] + "_numbered.csv"
# NOTE(review): neither handle is closed in the lines visible here — confirm they are closed later, or use `with`.
# NOTE(review): if these handles are passed to the csv module on Python 3, its docs recommend newline="" — confirm.
in_file = open(INPUTFILE, "r") # "r" == Open file for reading
out_file = open(OUTPUTFILE, "w") # "w" for writing
# Unpack every zip archive in the current directory, copy the unpacked item
# into the HDFS directory 'BNB', then delete the local unpacked copy.
# Globbing directly (instead of parsing `ls`) and quoting every expansion keeps
# filenames containing spaces or glob characters intact.
for zipfile in *.zip
do
    # An unmatched glob stays as the literal '*.zip'; skip it so a directory
    # with no archives is a no-op.
    [ -e "$zipfile" ] || continue
    # Archive name with its final extension stripped.
    unpacked="${zipfile%.*}"
    echo "Unpacking $zipfile"
    unzip "$zipfile"
    echo "Attempting to add $unpacked to HDFS directory 'BNB'"
    hadoop fs -copyFromLocal "$unpacked" "BNB/$unpacked"
    echo "Removing unpacked $unpacked from local directory"
    # '--' stops option parsing in case the name begins with a dash.
    rm -- "$unpacked"
done
0xFFFFFFFFFFFFFFFFFFFFFFFF13002cd712a92f022957058072afFFFFFFFFFFFF
0xFFFFFFFFFFFFFFFFFFFFFFFF1300002a774a6c02295705804824FFFFFFFFFFFF
0xFFFFFFFFFFFFFFFFFFFFFFFF13002cd7719d06032957058072afFFFFFFFFFFFF
0xFFFFFFFFFFFFFFFFFFFFFFFF1300002ae7f04203295705804824FFFFFFFFFFFF
0xFFFFFFFFFFFFFFFFFFFFFFFF13006953dd884103295705805f91FFFFFFFFFFFF
0xFFFFFFFFFFFFFFFFFFFFFFFF1300002a2cd2b303295705804824FFFFFFFFFFFF
0xFFFFFFFFFFFFFFFFFFFFFFFF130069538c8eb703295705805f91FFFFFFFFFFFF
0xFFFFFFFFFFFFFFFFFFFFFFFF13004ae23fb72004295705803d6dFFFFFFFFFFFF
0xFFFFFFFFFFFFFFFFFFFFFFFF130018bf5d2d2004295705806785FFFFFFFFFFFF
0xFFFFFFFFFFFFFFFFFFFFFFFF13006953fbe23504295705805f91FFFFFFFFFFFF
def bars(html_filename, list_of_stuff):
    """Write an HTML page containing one full-width coloured bar per item.

    Args:
        html_filename: Path of the HTML file to create (overwritten if it
            already exists).
        list_of_stuff: Iterable of values; each one is inserted after the
            ``#`` in a CSS ``background-color`` declaration, so each item
            should format as a hex colour code without the leading ``#``.
    """
    with open(html_filename, "w") as htmlfile:
        htmlfile.write("<html><head><style>.bar { width: 100%; height: 0.3em; } </style></head><body>")
        # One <div> per item, in input order; writelines batches the writes.
        htmlfile.writelines(
            '<div class="bar" style="background-color: #{0};">&nbsp;</div>\n'.format(instance_number)
            for instance_number in list_of_stuff
        )
        # Close the document once, after all bars have been written.
        htmlfile.write("</body></html>")
def blocks(html_filename, list_of_stuff):
with open(html_filename, "w") as htmlfile:
htmlfile.write("<html><head><style>.block { width: 0.3em; height: 0.3em; float:left; } </style></head><body>")
#!/usr/bin/env python
# Username; presumably the BoardGameGeek account substituted into the collection URL — confirm against usage.
USER = "benosteen"
# JSON file name; presumably a local cache of fetched data — usage is not visible here.
CACHE_FILE = "data.json"
# URL template for the BoardGameGeek XML API collection endpoint with rated=1; %s is filled in by the caller.
rating_t ="""http://www.boardgamegeek.com/xmlapi/collection/%s?rated=1"""
# URL template for the BoardGameGeek XML API boardgame endpoint with stats=1; %s is filled in by the caller.
weight_t = """http://www.boardgamegeek.com/xmlapi/boardgame/%s?stats=1"""
import requests
from xml.etree import ElementTree as ET
@benosteen
benosteen / console.log
Created April 30, 2012 22:25
Log of music playback - sent, retrieved and silence bytes
modprobe snd_bcm2835
====================
Apr 30 23:05:22 raspberrypi kernel: ### snd_bcm2835_alsa_probe c039e448 ############### PROBING FOR bcm2835 ALSA device (0):(1) ###############
Apr 30 23:05:22 raspberrypi kernel: Creating card...
Apr 30 23:05:22 raspberrypi kernel: Creating device/chip ..
Apr 30 23:05:22 raspberrypi kernel: Adding controls ..
Apr 30 23:05:22 raspberrypi kernel: Registering card ....
Apr 30 23:05:22 raspberrypi kernel: bcm2835 ALSA CARD CREATED!
Apr 30 23:05:22 raspberrypi kernel: ### BCM2835 ALSA driver init OK ###