@icehongssii
Last active April 13, 2018 02:48
list_of_people
from bs4 import BeautifulSoup as bs
from urllib2 import urlopen
from urllib2 import HTTPError
import re
names_list='''
Vitalik Buterin
Joseph Lubin
Max Keiser
Don Tapscott
Marc Andreessen
Adryenn Ashley
Gilles Babinet
Jim Marous
Thomas Power
Charlie Lee
John McAfee
Vinny Lingham
Naval Ravikant
Roger Ver
Brock Pierce
Ran Neuner
Dinis Guarda
Brett King
Andreas M. Antonopoulos
Erik Voorhees
Sally Eaves
Ross Gerber
Jackson Palmer
Tiff Hayden
Bruce Porter Jr.
Stacy Herbert
Warren Whitlock
Brian Armstrong
Tuur Demeester
Simon Dixon
Adam Back
Barry Silbert
Emin Gün Sirer
Meltem Demirors
Nick Szabo
Jimmy Song
Jeff Berwick
Laura Shin
Simon Cocking
Sebastian Meunier
Elizabeth Stark
Chris Skinner
Daniel Roberts
Nathaniel Popper
Michael Parsons
Mark Van Rijmenam
Michael Casey
Jihan Wu
Safaraz Ali
Michael Terpin
Antonio Selas
David G.W. Birch
William Mougayar
Oliver Bussmann
David M. Brear
Prof Bill Buchanan OBE
Simon Taylor
Dr. Julian Hosp
Pierre Tran
Gerald Celente
Alejandro De La Torre
Gavin Andresen
Neha Narula
Sydes Jokes
Alexander Tapscott
Charlotte Halkett
Camila Russo
Michael Novogratz
Jose Pagliery
Jeff Garzik
Gary Nuttall
Takashi Mochizuki
Sunny Ray
Fred Ehrsam
Jon Matonis
Michael Gastauer
Anthony Diiorio
Kumar Gaurav
Diana Biggs
Susanne Chishti
Anne Connelly
Wences Cesares
Sir Mark Walport
Hansjörg Leichsenring
Emily Spaven
Kathryn Haun
Ismail Malik
Gavin Wood
Rob Gill
Vinay Gupta
Tony Gallippi
Toni Lane
Janina Lowisz
Paolo Tasca
Michael Mainelli
Kristian T. Sorensen
Dean Demellweek
Valery Vavilov
Melanie Swan
Tim Draper
'''
#split the string into a list of names, one per line
names=(names_list.strip()).split("\n")
#returns False when the page is a disambiguation page (several people share the name)
def same_name_check(body):
    flag=True
    content=body.find("div",{"id":"mw-content-text"})
    if "refer" in content.find("p").text:
        flag=False
    return flag
#first sentence from the first paragraph
def get_first_sentence(my_string):
    linking_verbs = set(['was', 'is', 'are', 'were'])
    split_string = my_string.split(' ')
    first_sentence = []
    linked_verb_booly = False
    for ele in split_string:
        first_sentence.append(ele)
        if ele in linking_verbs:
            linked_verb_booly = True
        if '.' in ele and linked_verb_booly:
            break
    return ' '.join(first_sentence)
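#illustration (hypothetical input, not from the gist): given
#   "Vitalik Buterin is a Russian-Canadian programmer. He co-founded Ethereum."
#get_first_sentence() returns
#   "Vitalik Buterin is a Russian-Canadian programmer."
#because it stops at the first word containing "." once a linking verb has been seen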
#if several people share the name, return the href of the one related to cryptocurrency
#if none match, returns ""
def same_name_href(body):
    keywords=['cryptocurrency','bitcoin','crypto']
    key_name=""
    same_name=[]
    lis=(body.find("div",{"id":"mw-content-text"}).findAll("li"))
    for hrefs in lis: #collect the hrefs of the people sharing the name
        if "/wiki/" in hrefs.a['href']:
            same_name.append(hrefs.a['href'])
    for name in same_name:
        html=urlopen("https://en.wikipedia.org"+name)
        same_name_body=bs(html,"html.parser")
        info=same_name_body.find("p").text
        for keyword in keywords: #filter by the "crypto" keywords
            if keyword in info:
                key_name=name.split("/wiki/")[1]
    return key_name
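#illustration (hypothetical, not from the gist): for a disambiguation page that lists
#/wiki/Charlie_Lee_(actor) and /wiki/Charlie_Lee_(computer_scientist), same_name_href()
#fetches each entry's lead paragraph and returns the slug whose text mentions one of the
#keywords, e.g. "Charlie_Lee_(computer_scientist)"; it returns "" when no entry matches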
#collect the data for every name
big_data=[]
for name in names:
    json_data={}
    career_text=""
    try:
        name=name.replace(" ","_")
        html=urlopen("http://en.wikipedia.org/wiki/"+name)
    except HTTPError as e: #no such page
        json_data['name']=name
        json_data['des']="Missing"
        json_data['career']="Missing"
        big_data.append(json_data)
        continue
    else:
        body=bs(html,"html.parser")
        if not same_name_check(body): #disambiguation page
            single_name=same_name_href(body)
            if single_name=="": #found people with this name, but none related to cryptocurrency
                json_data['name']=name
                json_data['des']="Missing"
                json_data['career']="Missing"
                big_data.append(json_data)
                continue
            else:
                html=urlopen("http://en.wikipedia.org/wiki/"+single_name)
                body=bs(html,"html.parser")
        heading=body.find("h1",{"id":"firstHeading"})
        name=heading.text
        if "(" in name: #e.g. drop the "(teacher)" part from "Joseph Lubin (teacher)"
            name=name.split("(")[0].strip()
        json_data['name']=name
        des=get_first_sentence(body.find("div",{"id":"mw-content-text"}).find("p").getText()) #first sentence from the first paragraph
        json_data['des']=des
        careers=body.findAll("span",{"id":re.compile("career",re.I)}) #find the career section
        if len(careers)==0: #can't find a career section
            json_data['career']="Missing"
            career_text="Missing"
        else:
            for career in careers: #a page may have more than one career section
                siblings=career.parent.next_siblings
                for brother in siblings:
                    if brother.name==None:
                        pass
                    elif brother.name=="h2":
                        break
                    else:
                        tmp=(brother.text.encode('utf-8'))
                        career_text+=tmp
            json_data['career']=career_text
        print(name)
        print(des.strip())
        print(career_text)
        print("================")
        big_data.append(json_data)
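#optional extension (not part of the original gist): a minimal sketch that writes the
#scraped records to disk; the file name "crypto_people.json" is an assumption
import json

with open("crypto_people.json","w") as f:
    json.dump(big_data,f,indent=2)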