Kyle Gallatin (kylegallatin): GitHub gists
kylegallatin / keybase.md
Created July 9, 2018 13:27
Keybase verification

Keybase proof

I hereby claim:

  • I am kylegallatin on github.
  • I am kylegallatin (https://keybase.io/kylegallatin) on keybase.
  • I have a public key ASCqirlcFxpirUwa7dfrcfJYJ6G0I7p7JXb2KKMR5wsYjAo

To claim this, I am signing this object:

x (input)    y (output)
1            2
2            3
3            4
4            5
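
The table above maps every input x to x + 1. As a minimal sketch of learning that relationship from the four points (scikit-learn is an assumption here, not part of the original snippet):

import numpy as np
from sklearn.linear_model import LinearRegression

# the (input, output) pairs from the table above
X = np.array([[1], [2], [3], [4]])
y = np.array([2, 3, 4, 5])

model = LinearRegression()
model.fit(X, y)

# the fit should recover roughly y = 1*x + 1
print(model.coef_, model.intercept_)  # ~[1.] ~1.0
print(model.predict([[10]]))          # ~[11.]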
import spacy

nlp = spacy.load("en_core_web_sm")

# prompt the user for text (input replaces the Python 2 raw_input/unicode calls)
text = input("Please enter a phrase: ")
doc = nlp(text)

# print each token alongside its coarse part-of-speech tag
for token in doc:
    print(token.text, token.pos_)
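
A sample run might look like the following (the phrase is illustrative, and exact tags can vary slightly between model versions):

Please enter a phrase: the cat sat on the mat
the DET
cat NOUN
sat VERB
on ADP
the DET
mat NOUN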
diagnosis    radius    perimeter
1            17.99     122.8
1            20.59     17.77
1            18.25     119.6
0            13.49     86.91
0            11.52     73.34
0            10.17     64.55
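
Each row above pairs two numeric features with a binary diagnosis label. As a minimal sketch of fitting a classifier to this kind of data (scikit-learn is an assumption here, and six rows are far too few for a real model):

import numpy as np
from sklearn.linear_model import LogisticRegression

# features (radius, perimeter) and labels (diagnosis) from the table above
X = np.array([
    [17.99, 122.8],
    [20.59, 17.77],
    [18.25, 119.6],
    [13.49, 86.91],
    [11.52, 73.34],
    [10.17, 64.55],
])
y = np.array([1, 1, 1, 0, 0, 0])

clf = LogisticRegression()
clf.fit(X, y)

# predicted diagnosis for a hypothetical new sample
print(clf.predict([[15.0, 100.0]]))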
import requests
from bs4 import BeautifulSoup as bs

# get the web page
r = requests.get("http://quotes.toscrape.com")

# use BeautifulSoup to structure the HTML
soup = bs(r.content, "html.parser")

# print the parsed HTML (prettify is a method, so it needs to be called)
print(soup.prettify())
url = "http://quotes.toscrape.com/page/%i/"  # URL format to follow
page = 1       # page number to plug into the URL
stop = False   # flag to stop the crawl when we run out of pages
l = []         # list to append scraped info to

while not stop:
    r = requests.get(url % page)
    soup = bs(r.content, "html.parser")
    # a page with no quote divs means we've crawled past the last page
    if soup.find("div", {"class": "quote"}) is None:
        stop = True
    else:
        # on this site, each quote's text lives in a span with class "text"
        for quote in soup.find_all("div", {"class": "quote"}):
            l.append(quote.find("span", {"class": "text"}).text)
        page += 1
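
Once the loop exits, l holds one entry per scraped quote; a quick sanity check:

print(len(l))  # number of quotes collected across all pages
print(l[:3])   # first few scraped quotes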
from pdf2image import convert_from_path
import pytesseract

def pdf2txt(pdf_path, num_pages=10):
    ## use pdf2image to keep the pages in memory as images
    pages = convert_from_path(pdf_path, 500, first_page=0, last_page=num_pages)
    docs = []
    ## iterate through each page, saving the OCR'd text and page number
    for i, page in enumerate(pages):
        d = {}
        d["page"] = i
        d["text"] = pytesseract.image_to_string(page)
        docs.append(d)
    return docs
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch(["localhost:9200"])

# OCR the PDF into per-page documents (pdf_path is the path to the PDF file)
docs = pdf2txt(pdf_path)
index = "mueller-report"

## good practice to delete an index if it already exists and you're overwriting
es.indices.delete(index=index, ignore=[400, 404])
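
The bulk helper is imported above but the indexing step itself is missing; a minimal sketch of it, assuming the action format of elasticsearch-py's bulk helper and reusing the "clue" doc type from the search function below:

# build one bulk action per OCR'd page and index them all at once
actions = [
    {"_index": index, "_type": "clue", "_source": d}
    for d in docs
]
bulk(es, actions)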
def search_es(query):
    ## run a full-text match of the query against the "text" field
    res = es.search(
        index="mueller-report",
        doc_type="clue",
        body={"query": {"match": {"text": query}}},
    )
    return res
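
A usage sketch (the query string is illustrative; the hit structure follows the standard Elasticsearch response format):

res = search_es("russia")
for hit in res["hits"]["hits"]:
    # each hit carries the indexed page number and a relevance score
    print(hit["_source"]["page"], hit["_score"])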