Skip to content

Instantly share code, notes, and snippets.

@mtairu
Created December 5, 2016 06:33
Show Gist options
  • Save mtairu/70a108bb5e6de11e044eb4e3aeb7621e to your computer and use it in GitHub Desktop.
Save mtairu/70a108bb5e6de11e044eb4e3aeb7621e to your computer and use it in GitHub Desktop.
import pandas as pd
import requests as re
import sqlite3 as sql
from wordsegment import unigram_counts as uc
# Module-level state shared by the functions and statements below.
# Accumulators: per-name word lists, and names queued for removal.
wordniklist = []
deldomains = []
# Load table do_1 from the SQLite file, indexed by its 'host' column.
con = sql.connect("C:\\Users\\win10\\YandexDisk\\apps\\flask\\new_file.sqlite")
df0 = pd.read_sql(sql="SELECT * from do_1", con=con, index_col='host')
# Work on a copy so df0 keeps the table exactly as it was loaded.
df = df0.copy()
def wordnik(names=None, target=None):
    """Split hyphen-separated names into word lists and collect them.

    Every name has its hyphens replaced by spaces and is then split on
    whitespace, so ``"foo-bar"`` becomes ``["foo", "bar"]``.

    Args:
        names: Iterable of name strings. When omitted, the ``Dnsg`` column
            of the module-level ``df`` is used.
        target: List that receives one word list per name, in input order.
            When omitted, the module-level ``wordniklist`` is used.

    Returns:
        None. The word lists are appended to ``target`` in place.
    """
    if names is None:
        names = df.Dnsg.tolist()
    if target is None:
        target = wordniklist
    target.extend(name.replace('-', ' ').split() for name in names)
# Fill wordniklist with one word list per value of df.Dnsg.
wordnik()
# Attach the word lists to the frame as a new 'Dword' column (one list per row).
df['Dword'] = wordniklist
# Rebind the removal list to a fresh empty list before filtering; it was
# already initialised empty above, so this is a reset, not a first definition.
deldomains = []
# Report the number of rows before filtering.
print(len(df))
def filter_3():
    """Queue names whose leading words Wordnik does not recognise.

    For every word list in ``df.Dword`` the first two words (or the only
    word, when there is just one) are looked up through Wordnik's
    ``reverseDictionary`` endpoint. A word counts as unknown when the
    response's ``totalResults`` is ``0`` or missing. If any checked word is
    unknown, the words joined back together without a separator are appended
    once to the module-level ``deldomains`` list.

    Progress is printed per word ("appended <word>" / "<word> is valid").

    Returns:
        None. Results are accumulated in ``deldomains``.
    """
    endpoint = "http://api.wordnik.com:80/v4/words.json/reverseDictionary"
    # NOTE(review): this API key is committed in source; consider loading it
    # from configuration or the environment and rotating the key.
    fixed_params = {
        "minCorpusCount": 5,
        "maxCorpusCount": -1,
        "minLength": 1,
        "maxLength": -1,
        "includeTags": "false",
        "skip": 0,
        "limit": 10,
        "api_key": "8c29c2f3490107a0080030f3ddb048a7a8c65296f1a078e9a",
    }
    for words in df.Dword.tolist():
        rejected = False
        # Slicing instead of indexing [0] and [1] keeps single-word (or empty)
        # entries from raising IndexError.
        for word in words[:2]:
            # `re` is this file's alias for the requests package. Passing the
            # word through `params` lets requests URL-encode it.
            reply = re.get(endpoint, params=dict(fixed_params, query=word)).json()
            total = reply.get("totalResults")
            if total == 0 or total is None:
                print("appended " + word)
                rejected = True
            else:
                print(word + " is valid")
        if rejected:
            # Queue the name once even when both checked words are unknown.
            deldomains.append("".join(words))
# Run the Wordnik check; it fills deldomains with the index labels to remove.
filter_3()
print ("dropping domains ......")
# Remove every queued label from the frame (rows are matched on the index).
dfScrubbed = df.drop(deldomains)
dfScrubbed.to_csv('wordniklist.csv')
print(len(dfScrubbed))
conn = sql.connect('C:\\Users\\win10\\YandexDisk\\apps\\flask\\new_file.sqlite')
print ("writing to db as do_2")
try:
    # 'Dword' holds Python lists, which sqlite3 cannot bind as a parameter, so
    # it must be removed from the frame that is written. The original called
    # df.drop('Dword', axis=1) and df.drop('Dnsg', axis=1) but discarded both
    # results, so nothing was actually dropped.
    # NOTE(review): 'Dnsg' is kept in do_2 because it is a source column;
    # drop it here as well if that was the intent.
    dfScrubbed.drop('Dword', axis=1).to_sql(con=conn, name='do_2', if_exists='replace')
finally:
    # Release the database file even if the write fails.
    conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment