Kyle Gallatin (kylegallatin): GitHub gists
kylegallatin / keybase.md
Created July 9, 2018 13:27
Keybase verification

Keybase proof

I hereby claim:

  • I am kylegallatin on github.
  • I am kylegallatin (https://keybase.io/kylegallatin) on keybase.
  • I have a public key ASCqirlcFxpirUwa7dfrcfJYJ6G0I7p7JXb2KKMR5wsYjAo

To claim this, I am signing this object:

x (input)    y (output)
1            2
2            3
3            4
4            5
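
The table above maps every input x to x + 1. As a minimal sketch of learning that relationship from the four points (scikit-learn is an assumption here, not part of the original snippet):

import numpy as np
from sklearn.linear_model import LinearRegression

# the (input, output) pairs from the table above
X = np.array([[1], [2], [3], [4]])
y = np.array([2, 3, 4, 5])

model = LinearRegression()
model.fit(X, y)

# the fit should recover roughly y = 1*x + 1
print(model.coef_, model.intercept_)  # ~[1.] ~1.0
print(model.predict([[10]]))          # ~[11.]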
import spacy

nlp = spacy.load("en_core_web_sm")

# prompt the user for text (input replaces the Python 2 raw_input/unicode calls)
text = input("Please enter a phrase: ")
doc = nlp(text)

# print each token alongside its coarse part-of-speech tag
for token in doc:
    print(token.text, token.pos_)
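
A sample run might look like the following (the phrase is illustrative, and exact tags can vary slightly between model versions):

Please enter a phrase: the cat sat on the mat
the DET
cat NOUN
sat VERB
on ADP
the DET
mat NOUN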
diagnosis    radius    perimeter
1            17.99     122.8
1            20.59     17.77
1            18.25     119.6
0            13.49     86.91
0            11.52     73.34
0            10.17     64.55
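
Each row above pairs two numeric features with a binary diagnosis label. As a minimal sketch of fitting a classifier to this kind of data (scikit-learn is an assumption here, and six rows are far too few for a real model):

import numpy as np
from sklearn.linear_model import LogisticRegression

# features (radius, perimeter) and labels (diagnosis) from the table above
X = np.array([
    [17.99, 122.8],
    [20.59, 17.77],
    [18.25, 119.6],
    [13.49, 86.91],
    [11.52, 73.34],
    [10.17, 64.55],
])
y = np.array([1, 1, 1, 0, 0, 0])

clf = LogisticRegression()
clf.fit(X, y)

# predicted diagnosis for a hypothetical new sample
print(clf.predict([[15.0, 100.0]]))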
import requests
from bs4 import BeautifulSoup as bs

# get the web page
r = requests.get("http://quotes.toscrape.com")

# use BeautifulSoup to structure the HTML
soup = bs(r.content, "html.parser")

# print the parsed HTML (prettify is a method, so it needs to be called)
print(soup.prettify())
url = "http://quotes.toscrape.com/page/%i/"  # URL format to follow
page = 1       # page number to plug into the URL
stop = False   # flag to stop the crawl when we run out of pages
l = []         # list to append scraped info to

while not stop:
    r = requests.get(url % page)
    soup = bs(r.content, "html.parser")
    # a page with no quote divs means we've crawled past the last page
    if soup.find("div", {"class": "quote"}) is None:
        stop = True
    else:
        # on this site, each quote's text lives in a span with class "text"
        for quote in soup.find_all("div", {"class": "quote"}):
            l.append(quote.find("span", {"class": "text"}).text)
        page += 1
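
Once the loop exits, l holds one entry per scraped quote; a quick sanity check:

print(len(l))  # number of quotes collected across all pages
print(l[:3])   # first few scraped quotes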
from pdf2image import convert_from_path
import pytesseract

def pdf2txt(pdf_path, num_pages=10):
    ## use pdf2image to keep the pages in memory as images
    pages = convert_from_path(pdf_path, 500, first_page=0, last_page=num_pages)
    docs = []
    ## iterate through each page, saving the OCR'd text and page number
    for i, page in enumerate(pages):
        d = {}
        d["page"] = i
        d["text"] = pytesseract.image_to_string(page)
        docs.append(d)
    return docs
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch(["localhost:9200"])

# OCR the PDF into per-page documents (pdf_path is the path to the PDF file)
docs = pdf2txt(pdf_path)
index = "mueller-report"

## good practice to delete an index if it already exists and you're overwriting
es.indices.delete(index=index, ignore=[400, 404])
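
The bulk helper is imported above but the indexing step itself is missing; a minimal sketch of it, assuming the action format of elasticsearch-py's bulk helper and reusing the "clue" doc type from the search function below:

# build one bulk action per OCR'd page and index them all at once
actions = [
    {"_index": index, "_type": "clue", "_source": d}
    for d in docs
]
bulk(es, actions)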
def search_es(query):
    ## run a full-text match of the query against the "text" field
    res = es.search(
        index="mueller-report",
        doc_type="clue",
        body={"query": {"match": {"text": query}}},
    )
    return res
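
A usage sketch (the query string is illustrative; the hit structure follows the standard Elasticsearch response format):

res = search_es("russia")
for hit in res["hits"]["hits"]:
    # each hit carries the indexed page number and a relevance score
    print(hit["_source"]["page"], hit["_score"])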