JGVerdugo

## tika-extract.py
#!/usr/bin/python
# Runs in Python 3
# Processes MS Word files with Tika. No exception or error handling; use at your own risk.
# Assumes lowercase .doc and .docx extensions.
# Output is written to new files with corresponding file exts in the same dir.

import os
import sys
import subprocess
import glob

## dotika.py
import glob
import os

# USAGE:
# 1. Download the Tika command prompt tool from http://tika.apache.org/download.html.
# 2. Put some files in the same directory.
# 3. Put this script in the same directory (make sure you have Python).
# 4. In the command line, write "python dotika.py".
# If Tika can extract your files, a new file with the extension .new
# will be created for each file matching the "extension" filter (see

## extract-tei.py
"""Extracts a TEI bilingual vocabulary to a term-tab-term plain-text structure"""

"""Note: this script uses the BeautifulSoup library for TEI parsing."""
"""See http://www.crummy.com/software/BeautifulSoup/bs4 for details."""

from bs4 import BeautifulSoup
import sys
import codecs

if len(sys.argv) < 2:

## S_2012_10_Add_20_ES-mod.tmx
<?xml version="1.0"?><tmx version="1.4">
  <header adminlang="EN-GB"
      creationtool="CADT TMX Generator"
  	  creationtoolversion="1.0"
		  datatype="unknown"
		  o-tmf="TW4Win 2.0 Format"
		  segtype="sentence"
		  srclang="EN-GB">
    <prop type="Att::Source File">S_2012_10_Add_20_ES-mod</prop>
  </header>

## ziptest.py
#!/usr/bin/python

import os
import os.path
import zipfile
from datetime import datetime
from re import sub

sourceFile = zipfile.ZipFile('text.docx')

## punct.py
#!/usr/bin/python

import codecs
import os
import re
import unicodedata

# Loads a UTF-8 text file into memory as a character string
def readDoc(filename):
	file = codecs.open(filename, "r", "UTF-8")

## punct.py
#!/usr/bin/python

import codecs
import os
import re
import unicodedata

# Loads a UTF-8 text file into memory as a character string
def readDoc(filename):
	file = codecs.open(filename, "r", "UTF-8")

## lemma-count.py
#!/usr/bin/python
#
# Counts Arabic words with 1-10 characters in a text file.
# Counts hapaxes.
# Shadda is assumed to be equal to (1) character.
# All other diacritics and punctuation are discarded.
# Assumes tokenized UTF-8 input.

import sys
import os.path
	#!/usr/bin/python
	# Runs in Python 3
	# Processes MS Word files with Tika. No exception or error handling; use at your own risk.
	# Assumes lowercase .doc and .docx extensions.
	# Output is written to new files with corresponding file exts in the same dir.

	import os
	import sys
	import subprocess
	import glob
	import glob
	import os

	# USAGE:
	# 1. Download the Tika command prompt tool from http://tika.apache.org/download.html.
	# 2. Put some files in the same directory.
	# 3. Put this script in the same directory (make sure you have Python).
	# 4. In the command line, write "python dotika.py".
	# If Tika can extract your files, a new file with the extension .new
	# will be created for each file matching the "extension" filter (see
	"""Extracts a TEI bilingual vocabulary to a term-tab-term plain-text structure"""

	"""Note: this script uses the BeautifulSoup library for TEI parsing."""
	"""See http://www.crummy.com/software/BeautifulSoup/bs4 for details."""

	from bs4 import BeautifulSoup
	import sys
	import codecs

	if len(sys.argv) < 2:
	<?xml version="1.0"?><tmx version="1.4">
	<header adminlang="EN-GB"
	creationtool="CADT TMX Generator"
	creationtoolversion="1.0"
	datatype="unknown"
	o-tmf="TW4Win 2.0 Format"
	segtype="sentence"
	srclang="EN-GB">
	<prop type="Att::Source File">S_2012_10_Add_20_ES-mod</prop>
	</header>
	#!/usr/bin/python

	import os
	import os.path
	import zipfile
	from datetime import datetime
	from re import sub

	sourceFile = zipfile.ZipFile('text.docx')
	#!/usr/bin/python
	#
	# Counts Arabic words with 1-10 characters in a text file.
	# Counts hapaxes.
	# Shadda is assumed to be equal to (1) character.
	# All other diacritics and punctuation are discarded.
	# Assumes tokenized UTF-8 input.

	import sys
	import os.path