This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python2.7 -S
# Python 2 only: force the interpreter-wide default string encoding to UTF-8.
#
# The -S flag in the shebang stops the interpreter from importing `site`
# automatically.  `site` normally removes `sys.setdefaultencoding` during
# start-up, so the call below is only possible while `site` has not run yet.
import sys

sys.setdefaultencoding("utf-8")

# Import `site` by hand afterwards so the usual site-specific path setup that
# -S skipped still takes place.
import site
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> import sys
>>> sys.getdefaultencoding()
'ascii'
>>> sys.getfilesystemencoding()
'UTF-8'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print the unique, whitespace-stripped lines of `file` in sorted order,
# re-encoded as Windows-1252.
#
# The codec name must use an ASCII hyphen-minus.  It was previously written
# with U+2011 (NON-BREAKING HYPHEN), which looks identical but is a non-ASCII
# character: it is not a registered codec name and is rejected in a Python 2
# source file that has no coding declaration.
#
# NOTE(review): `file` (the input path) and `codecs` are expected to be
# provided by the surrounding code - confirm.
with codecs.open(file, encoding='Windows-1252') as corpus:
    corpus_words = set(line.strip() for line in corpus)

for word in sorted(corpus_words):
    # Single-argument parenthesised print is identical to the print statement
    # under Python 2.
    print(word.encode("Windows-1252"))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
file -bi uniq_words_in_corpus.txt
#output: text/plain; charset=unknown-8bit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#file to parse: https://dl.dropbox.com/u/18146922/uniq_words_in_corpus.txt | |
def getEncoding(infile):
    """Detect, print and return the character encoding of a file.

    Args:
        infile: Path of the file to inspect.

    Returns:
        The value stored under the ``'encoding'`` key of the result of
        ``chardet.detect``.  Earlier versions of this function only printed
        the value; it is still printed, and now also returned.
    """
    import chardet

    # Detection works on raw bytes, so read in binary mode: text mode may
    # alter the byte stream (e.g. newline translation) before chardet sees it.
    # The context manager closes the handle, which was previously leaked.
    with open(infile, "rb") as handle:
        rawdata = handle.read()

    charenc = chardet.detect(rawdata)['encoding']
    print(charenc)
    return charenc
#output: ISO-8859-2. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def identifierSplitByFolder(folderA,folderB): | |
""" usage: identifierSplitByFolder(folderWithJavaFiles,folderWithJavaFilesIdentifierSplit ) """ | |
import re, string, os | |
for root, directory, files in os.walk(folderA): | |
for file in files: | |
absfnA = os.path.join(folderA,file) | |
absfnB = os.path.join(folderB,file) | |
words=open(absfnA).read().replace("\r\n"," ").split(" ") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def createSynsetDict(): | |
import pymysql | |
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='', db='multiwordnet') | |
cur = conn.cursor() | |
syndict={} | |
fp = open("C:\Users\Tathagata\projects\NewTracelabData\EX3\Albergate\AlbergateIdentifierJDKMethods201304040238SplitTransUniqWordsCopy.txt") | |
content = fp.read() | |
words = content.decode("utf-8").lower().split() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# List the ten most recently modified regular files under directory $1
# (defaults to the current directory when no argument is given).
#  - "$1" is quoted so directory names containing spaces or glob characters
#    are passed to find as a single argument.
#  - xargs -r skips running stat when find produces no files.
#  - stat prints "<epoch> :<human-readable mtime> <name>"; sort orders by the
#    epoch (newest first) and cut drops everything up to the first ':'.
find "${1:-.}" -type f -print0 |
  xargs -0 -r stat --format '%Y :%y %n' |
  sort -nr |
  cut -d: -f2- |
  head
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print one random line (word) from the system dictionary.
# shuf picks uniformly among the lines actually present in the file.  The
# previous sed/perl form drew a line number in 0..99998, which fails on 0
# (not a valid sed address) and prints nothing whenever the number exceeds
# the dictionary's length.
shuf -n 1 /usr/share/dict/words
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

folder_A = r'''path/to/folder/A'''
folder_B = r'''path/to/folder/B'''


def relative_files(folder):
    """Return the set of file paths under *folder*, relative to *folder*.

    Files directly inside *folder* are reported by bare name; files in
    sub-directories keep their relative directory prefix.  A folder that
    does not exist yields an empty set, because ``os.walk`` produces nothing
    for it.
    """
    found = set()
    for root, _dirnames, filenames in os.walk(folder):
        rel_root = os.path.relpath(root, folder)
        for filename in filenames:
            # normpath drops the leading "./" for files at the top level.
            found.add(os.path.normpath(os.path.join(rel_root, filename)))
    return found


# Compare the two trees once, by relative path.  Nesting one os.walk inside
# the other would compare every directory of A against every directory of B,
# which is only meaningful when both folders are flat.
files_A = relative_files(folder_A)
files_B = relative_files(folder_B)

# True when both trees contain exactly the same relative file paths.
print(files_A == files_B)
# Files present under folder_A but missing under folder_B.
print(files_A - files_B)