Skip to content

Instantly share code, notes, and snippets.

@aniemerg
Created August 19, 2012 22:58
Show Gist options
  • Save aniemerg/3398373 to your computer and use it in GitHub Desktop.
Save aniemerg/3398373 to your computer and use it in GitHub Desktop.
# DescLength()
# Counts the words in each patent description and writes a histogram of those counts to a text file
import MySQLdb as mdb
import sys
import datetime
import re
import math
import time
import numpy as np
def DescLength(TrimTop20 = False):
    """Write a word-count histogram of patent descriptions to a text file.

    Reads up to 100 ``description`` values from the ``USPatents`` table,
    counts the regex word tokens in each one, bins the counts into 29
    equal-width bins and writes one ``['<left bin edge>', <count>],`` line
    per bin to ``DescLen<timestamp>.txt`` in the current directory.

    Args:
        TrimTop20: When true, discard the longest 20% of descriptions
            before building the histogram so outliers do not stretch
            the bins.

    Returns:
        The name of the file that was written. The file is empty when
        there is nothing to plot (no rows, or everything was trimmed).
    """
    # Get patents from database. Close the connection explicitly instead of
    # relying on ``with con:``, whose meaning differs between driver versions.
    con = mdb.connect('localhost', 'username', 'password', 'database')
    try:
        cur = con.cursor()
        try:
            cur.execute("SELECT description from USPatents LIMIT 100")
            rows = cur.fetchall()
        finally:
            cur.close()
    finally:
        con.close()

    # Count words in each patent. Rows with a NULL description are skipped:
    # they have no length to measure and would make re.findall raise.
    word_pattern = re.compile(r'\w+')
    desclengths = [
        len(word_pattern.findall(row[0]))
        for row in rows
        if row[0] is not None
    ]

    # Chop off top 20%. The slice starts at 0 so that only the longest
    # entries are removed and the shortest one is kept.
    if TrimTop20:
        desclengths = sorted(desclengths)
        desclengths = desclengths[:int(len(desclengths) * .8)]

    # Create histogram: 30 evenly spaced edges give 29 bins, padded by one
    # on each side of the observed range. Skipped when there is no data,
    # because min()/max() raise on an empty list.
    counts = []
    edges = []
    if desclengths:
        bin_edges = np.linspace(min(desclengths) - 1, max(desclengths) + 1, num=30)
        counts, edges = np.histogram(np.array(desclengths), bins=bin_edges)

    # Output to file. The timestamp includes hours and minutes so that runs
    # on the same day do not collide on the seconds field alone.
    thetime = time.strftime("%Y%m%d_%H%M%S")
    outputfile = "DescLen" + thetime + ".txt"
    with open(outputfile, 'w') as outfile:
        # zip stops at the shorter sequence, pairing each count with the
        # left edge of its bin.
        for edge, count in zip(edges, counts):
            outfile.write("['%s', %s],\n" % (int(edge), count))
    return outputfile
# Script entry point: build the histogram without trimming when this file
# is executed directly rather than imported.
if __name__ == "__main__":
    DescLength(TrimTop20=False)
@aniemerg
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment