Skip to content

Instantly share code, notes, and snippets.

#!/usr/bin/python
# Runs in Python 3
# Processes MSWord files with Tika. Zero exception or error control, use at own risk.
# Assumes lowercase .doc and .docx extensions.
# Output is written to new files with corresponding file exts in the same dir.
import os
import sys
import subprocess
import glob
@JGVerdugo
JGVerdugo / dotika.py
Created February 9, 2014 18:55
Extracts text from multiple documents using Apache Tika
import glob
import os
# USAGE:
# 1. Download the Tika command prompt tool from http://tika.apache.org/download.html.
# 2. Put some files in the same directory.
# 3. Put this script in the same directory (make sure you have Python).
# 4. In the command line, write "python dotika.py".
# If Tika can extract your files, a new file with the extension .new
# will be created for each file matching the "extension" filter (see
@JGVerdugo
JGVerdugo / extract-tei.py
Created April 19, 2013 23:02
Extracting a TEI bilingual dictionary with BeautifulSoup
"""Extracts a TEI bilingual vocabulary to a term-tab-term plain-text structure"""
"""Note: this script uses the BeautifulSoup library for TEI parsing."""
"""See http://www.crummy.com/software/BeautifulSoup/bs4 for details."""
from bs4 import BeautifulSoup
import sys
import codecs
if len(sys.argv) < 2:
@JGVerdugo
JGVerdugo / S_2012_10_Add_20_ES-mod.tmx
Created January 14, 2013 20:17
TMX example file showing proposals for CADT tool
<?xml version="1.0"?><tmx version="1.4">
<header adminlang="EN-GB"
creationtool="CADT TMX Generator"
creationtoolversion="1.0"
datatype="unknown"
o-tmf="TW4Win 2.0 Format"
segtype="sentence"
srclang="EN-GB">
<prop type="Att::Source File">S_2012_10_Add_20_ES-mod</prop>
</header>
@JGVerdugo
JGVerdugo / ziptest.py
Created November 8, 2012 03:38
Learning to use Python zipfile module with docx files
#!/usr/bin/python
import os
import os.path
import zipfile
from datetime import datetime
from re import sub
sourceFile = zipfile.ZipFile('text.docx')
@JGVerdugo
JGVerdugo / punct.py
Created November 8, 2012 03:31
Replaces punctuation with tags
#!/usr/bin/python
import codecs
import os
import re
import unicodedata
# Loads a UTF-8 text file into memory as a character string
def readDoc(filename):
file = codecs.open(filename, "r", "UTF-8")
@JGVerdugo
JGVerdugo / punct.py
Created November 8, 2012 03:30
Replaces punctuation with tags
#!/usr/bin/python
import codecs
import os
import re
import unicodedata
# Loads a UTF-8 text file into memory as a character string
def readDoc(filename):
file = codecs.open(filename, "r", "UTF-8")
@JGVerdugo
JGVerdugo / lemma-count.py
Created October 29, 2012 20:42
Arabic lemma counter
#!/usr/bin/python
#
# Counts Arabic words with 1-10 characters in a text file.
# Counts hapaxes.
# Shadda is assumed to be equal to (1) character.
# All other diacritics and punctuation are discarded.
# Assumes tokenized UTF-8 input.
import sys
import os.path