Last active
May 9, 2016 09:34
-
-
Save arwer13/13ad705ddd7e52759fc10308fc98d7eb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2
# wordDB.py
# Word Frequency Database Generator
# Author: Ryan McCarl
# License: GPL 3
import os
import sqlite3

import nltk as n
from nltk.corpus import brown
from nltk.stem import WordNetLemmatizer as wordnet
# Module-level handle to the corpus; starts empty and is rebound by
# loadCorpus().
corpus = []
# Name of the single table that every query in this file targets.
table_name = "frequencies"
def makeDbPath(lang, db_dir=None):
    """Build the path of the SQLite database file for a language.

    Args:
        lang: Language identifier; it becomes the base name of the file.
        db_dir: Optional directory to place the database in. When omitted,
            the original hard-coded Windows directory is used.

    Returns:
        The database path as a string, ending in ``<lang>.db``.
    """
    if db_dir is None:
        # Default kept byte-for-byte so existing callers get the same path.
        return r"C:\PythonDatabases\%s.db" % lang
    return os.path.join(db_dir, "%s.db" % lang)
def connectDb(dbName):
    """Open the SQLite database at ``dbName`` and return a cursor on it.

    Args:
        dbName: Path (or other database name) handed to ``sqlite3.connect``.

    Returns:
        A ``sqlite3.Cursor``. Only the cursor is returned; the connection
        that owns it stays reachable as ``cursor.connection``, which is what
        a caller needs in order to commit or close.
    """
    return sqlite3.connect(dbName).cursor()
def makeTables(c, lang):
    """Create the frequency table if it is missing and seed one debug row.

    Args:
        c: Open ``sqlite3`` cursor the statements are executed on.
        lang: Unused by this function; kept so existing callers keep working.
    """
    c.execute(
        "CREATE TABLE IF NOT EXISTS " + table_name + " " +
        "(id INTEGER PRIMARY KEY, freqRank INTEGER, lemma TEXT)"
    )
    # The seed row has a fixed primary key (id=1). A plain INSERT raises
    # sqlite3.IntegrityError once that row exists, so OR IGNORE is used to
    # make repeated calls against the same table safe.
    query = "INSERT OR IGNORE INTO " + table_name + " VALUES (?, ?, ?)"
    test_word = "Hello"  # for debugging
    c.execute(query, (1, 1, test_word))
def insert_into_frequencies(c, words_info):
    """Bulk-insert rows into the frequency table.

    Args:
        c: Open ``sqlite3`` cursor.
        words_info: Iterable of two-item sequences; the items are bound, in
            order, to the ``lemma`` and ``freqRank`` columns.

    NOTE(review): main() feeds this with the result of ``most_common()``,
    presumably ``(word, count)`` pairs, so ``freqRank`` would hold an
    occurrence count rather than a rank. Confirm which one is intended.
    """
    query = "INSERT INTO {} (lemma, freqRank) VALUES (?, ?)".format(table_name)
    c.executemany(query, words_info)
def loadCorpus():
    """Rebind the module-level ``corpus`` to ``nltk.corpus.brown``."""
    # ``global`` is required because the function assigns to the name.
    global corpus
    corpus = n.corpus.brown
def GetFreq(corpus, limit=500):
    """Return the most frequent lower-cased words of a corpus.

    Args:
        corpus: Object exposing a ``words()`` method that yields strings.
        limit: How many entries to return; defaults to 500, the value that
            was previously hard-coded.

    Returns:
        The result of ``FreqDist.most_common(limit)`` computed over the
        lower-cased words (presumably ``(word, count)`` pairs, most frequent
        first).
    """
    lowered_words = (w.lower() for w in corpus.words())
    return n.FreqDist(lowered_words).most_common(limit)
def main():
    """Prompt for a language, build its frequency database and print the row count.

    The corpus is loaded, the table is created, the most common words are
    inserted and the transaction is committed so the rows actually reach the
    database file. The cursor and its connection are closed even when one of
    the steps fails.
    """
    lang = raw_input("What language would you like to create a database for?\n")
    # lang = 'eng'
    loadCorpus()
    db_path = makeDbPath(lang)
    cursor = connectDb(db_path)
    # connectDb() hands back only the cursor; the owning connection is
    # reachable through it and is needed to commit and close.
    conn = cursor.connection
    try:
        makeTables(cursor, lang)
        most_common = GetFreq(corpus)
        insert_into_frequencies(cursor, most_common)
        # Without an explicit commit the inserts are rolled back when the
        # connection goes away, leaving the table empty.
        conn.commit()
        cursor.execute('SELECT COUNT(*) FROM {}'.format(table_name))
        print(cursor.fetchall())
    finally:
        cursor.close()
        conn.close()


# Run only when executed as a script, not when the module is imported.
if __name__ == "__main__":
    main()
Note that CamelCase naming (as it is called) for variables and functions is not the Pythonic way, as PEP 8 defines:
https://www.python.org/dev/peps/pep-0008/#function-names
https://www.python.org/dev/peps/pep-0008/#method-names-and-instance-variables
In Python, this convention is commonly accepted by the whole community.
This is not true of, for example, C++, where there is no prevalent coding style.
These issues might not be important when you are just starting to learn, but I'd still advise sticking to the convention from the beginning, if only to build a good habit.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
On the `global` keyword, cited from here: You can use a global variable in other functions by declaring it as global in each function that assigns to it:
I imagine the reason for it is that, since global variables are so dangerous, Python wants to make sure that you really know that's what you're playing with by explicitly requiring the global keyword.