I hereby claim:
- I am kylegallatin on github.
- I am kylegallatin (https://keybase.io/kylegallatin) on keybase.
- I have a public key ASCqirlcFxpirUwa7dfrcfJYJ6G0I7p7JXb2KKMR5wsYjAo
To claim this, I am signing this object:
x (input) | y (output)
---|---
1 | 2
2 | 3
3 | 4
4 | 5
# Tag each token of a user-supplied phrase with its part-of-speech label.
import spacy

# Load the small English pipeline (requires: python -m spacy download en_core_web_sm).
nlp = spacy.load('en_core_web_sm')

# Python 3 fix: input() replaces raw_input() and already returns unicode str,
# so the Python-2-only unicode(...) wrapper is unnecessary (and a NameError on 3.x).
text = input("Please enter a phrase: ")
doc = nlp(text)

# Print each token next to its coarse-grained POS tag.
for token in doc:
    print(token, token.pos_)
diagnosis | radius | perimeter
---|---|---
1 | 17.99 | 122.8
1 | 20.59 | 17.77
1 | 18.25 | 119.6
0 | 13.49 | 86.91
0 | 11.52 | 73.34
0 | 10.17 | 64.55
# Fetch a page and pretty-print its HTML.
import requests
from bs4 import BeautifulSoup as bs

# Get the web page.
r = requests.get("http://quotes.toscrape.com")

# Use BeautifulSoup to structure the HTML. Name the parser explicitly:
# bs(r.content) alone emits a "no parser specified" warning and can pick
# a different parser on different machines.
soup = bs(r.content, "html.parser")

# prettify is a method — the original referenced it without calling it,
# which evaluated the bound method and produced no output. Call and print it.
print(soup.prettify())
url = "http://quotes.toscrape.com/page/%i/"  # url format to follow
page = 1      # page number to request
stop = False  # flag flipped once we crawl past the last page
l = []        # list to append scraped info to

# Crawl successive pages until a page contains no quote divs.
# NOTE(review): this excerpt never increments `page` or appends to `l` —
# the rest of the loop body appears truncated in this extract; confirm
# against the full gist before relying on it.
while not stop:
    # Reuse the `url` template defined above instead of repeating the literal.
    r = requests.get(url % page)
    soup = bs(r.content, "html.parser")
    # An empty results page (no quote divs) marks the end of the site.
    if soup.find("div", {"class": "quote"}) is None:
        stop = True
# Fetch a page and pretty-print its HTML.
import requests
from bs4 import BeautifulSoup as bs

# Get the web page.
r = requests.get("http://quotes.toscrape.com")

# Use BeautifulSoup to structure the HTML. Name the parser explicitly:
# bs(r.content) alone emits a "no parser specified" warning and can pick
# a different parser on different machines.
soup = bs(r.content, "html.parser")

# prettify is a method — the original referenced it without calling it,
# which evaluated the bound method and produced no output. Call and print it.
print(soup.prettify())
# OCR helper: render the first `num_pages` pages of a PDF and collect
# per-page text records.
# NOTE(review): this extract is truncated — the loop below only
# initializes `d`; the pytesseract OCR call and the return of `docs`
# are not visible here. Confirm against the full source.
from pdf2image import convert_from_path | |
import pytesseract | |
def pdf2txt(pdf_path, num_pages=10): | |
##use pdf2image to keep the pages in memory
# (renders at 500 DPI; pages beyond `num_pages` are skipped)
pages = convert_from_path(pdf_path, 500, first_page=0, last_page=num_pages) | |
docs = [] | |
## iterate through each page saving the text and page number
for i,page in enumerate(pages): | |
d = {} |
# Index the OCR'd PDF pages into a local Elasticsearch instance.
# NOTE(review): `pdf_path` is not defined in this extract — presumably set
# earlier in the full notebook/gist; verify before running.
# NOTE(review): the extract ends after the comment below — the index
# deletion/creation and bulk-insert code is truncated here.
from elasticsearch import Elasticsearch | |
from elasticsearch.helpers import bulk | |
es = Elasticsearch(['localhost:9200']) | |
docs = pdf2txt(pdf_path) | |
index = "mueller-report" | |
##good practice to delete an index if it already exists and you're overwriting |
def search_es(query, index="mueller-report", doc_type="clue"):
    """Run a full-text ``match`` query against the report index.

    Args:
        query: Free-text string matched against the ``text`` field.
        index: Elasticsearch index to search. Defaults to the index name
            used at ingest time, so existing callers are unaffected.
        doc_type: Mapping type used when the documents were indexed.

    Returns:
        The raw Elasticsearch response dict (hits under
        ``res["hits"]["hits"]``).
    """
    # NOTE(review): `doc_type` is deprecated in Elasticsearch 7+ — drop the
    # argument when the cluster/client is upgraded.
    res = es.search(
        index=index,
        doc_type=doc_type,
        body={"query": {"match": {"text": query}}},
    )
    return res