Last active
April 13, 2018 02:48
-
-
Save icehongssii/26c4985e8dcee52ef489d80cd976886e to your computer and use it in GitHub Desktop.
list_of_people
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup as bs | |
from urllib2 import urlopen | |
from urllib2 import HTTPError | |
import re | |
# Newline-separated roster of crypto/fintech people whose Wikipedia pages
# the crawl loop below will fetch.
names_list='''
Vitalik Buterin
Joseph Lubin
Max Keiser
Don Tapscott
Marc Andreessen
Adryenn Ashley
Gilles Babinet
Jim Marous
Thomas Power
Charlie Lee
John McAfee
Vinny Lingham
Naval Ravikant
Roger Ver
Brock Pierce
Ran Neuner
Dinis Guarda
Brett King
Andreas M. Antonopoulos
Erik Voorhees
Sally Eaves
Ross Gerber
Jackson Palmer
Tiff Hayden
Bruce Porter Jr.
Stacy Herbert
Warren Whitlock
Brian Armstrong
Tuur Demeester
Simon Dixon
Adam Back
Barry Silbert
Emin Gün Sirer
Meltem Demirors
Nick Szabo
Jimmy Song
Jeff Berwick
Laura Shin
Simon Cocking
Sebastian Meunier
Elizabeth Stark
Chris Skinner
Daniel Roberts
Nathaniel Popper
Michael Parsons
Mark Van Rijmenam
Michael Casey
Jihan Wu
Safaraz Ali
Michael Terpin
Antonio Selas
David G.W. Birch
William Mougayar
Oliver Bussmann
David M. Brear
Prof Bill Buchanan OBE
Simon Taylor
Dr. Julian Hosp
Pierre Tran
Gerald Celente
Alejandro De La Torre
Gavin Andresen
Neha Narula
Sydes Jokes
Alexander Tapscott
Charlotte Halkett
Camila Russo
Michael Novogratz
Jose Pagliery
Jeff Garzik
Gary Nuttall
Takashi Mochizuki
Sunny Ray
Fred Ehrsam
Jon Matonis
Michael Gastauer
Anthony Diiorio
Kumar Gaurav
Diana Biggs
Susanne Chishti
Anne Connelly
Wences Cesares
Sir Mark Walport
Hansjörg Leichsenring
Emily Spaven
Kathryn Haun
Ismail Malik
Gavin Wood
Rob Gill
Vinay Gupta
Tony Gallippi
Toni Lane
Janina Lowisz
Paolo Tasca
Michael Mainelli
Kristian T. Sorensen
Dean Demellweek
Valery Vavilov
Melanie Swan
Tim Draper
'''
#divide the string by "\n"
# NOTE(review): `list` shadows the builtin of the same name; the crawl loop
# below iterates this exact global, so the name is deliberately left as-is here.
list=(names_list.strip()).split("\n")
#if there are same names | |
def same_name_check(body):
    """Return True when *body* looks like a regular article page.

    Wikipedia disambiguation pages open their lead paragraph with wording
    like "... may refer to:", so finding "refer" in the first <p> of the
    content div flags an ambiguous (shared) name and yields False.

    body: parsed page (BeautifulSoup object) — assumed to contain a div
          with id "mw-content-text" holding at least one <p>. TODO confirm
          for malformed pages.
    """
    # Renamed the local previously called `list` — it shadowed the builtin.
    content = body.find("div", {"id": "mw-content-text"})
    # "refer" in the lead paragraph => disambiguation page => not unique.
    return "refer" not in content.find("p").text
#first sentence from first paragraph | |
def get_first_sentence(my_string):
    """Return the leading words of *my_string* up to and including the first
    '.'-terminated word that appears after a linking verb.

    If no linking verb (or no '.' after one) occurs, the whole string is
    returned unchanged.
    """
    linking_verbs = {'was', 'is', 'are', 'were'}
    collected = []
    seen_verb = False
    for word in my_string.split(' '):
        collected.append(word)
        # Once a linking verb has been seen, stop at the next word holding a period.
        seen_verb = seen_verb or word in linking_verbs
        if seen_verb and '.' in word:
            break
    return ' '.join(collected)
#if there are same naems, return the right one's href related to cryptocurrency | |
#if not, returns "\n" | |
def same_name_href(body):
    """Resolve a disambiguation page to its cryptocurrency-related entry.

    Collects every /wiki/ link inside <li> items of the content div,
    fetches each candidate page, and returns the wiki slug (the part after
    "/wiki/") of the last candidate whose lead paragraph mentions a crypto
    keyword. Returns "" when no candidate matches.

    body: parsed disambiguation page (BeautifulSoup object).
    """
    # BUG FIX: the original list was ['cryptocurrency','bitcoin''crypto'] —
    # the missing comma implicitly concatenated the last two strings into
    # 'bitcoincrypto', so 'bitcoin' and 'crypto' alone could never match.
    keywords = ['cryptocurrency', 'bitcoin', 'crypto']
    key_name = ""
    same_name = []
    items = body.find("div", {"id": "mw-content-text"}).findAll("li")
    for item in items:  # gather candidate article hrefs for the shared name
        if "/wiki/" in item.a['href']:
            same_name.append(item.a['href'])
    for href in same_name:
        html = urlopen("https://en.wikipedia.org" + href)
        candidate = bs(html)
        intro = candidate.find("p").text
        # Keep scanning all candidates (last match wins), as the original did.
        for keyword in keywords:
            if keyword in intro:
                key_name = href.split("/wiki/")[1]
    return key_name
#to get all the data | |
# Crawl Wikipedia for each person and collect one record per name:
# {'name': ..., 'des': first sentence, 'career': career-section text or "Missing"}.
big_data = []
for name in list:
    json_data = {}
    career_text = ""
    name = name.replace(" ", "_")  # Wikipedia URL form
    try:
        html = urlopen("http://en.wikipedia.org/wiki/" + name)
    except HTTPError:  # no such page: record placeholders and move on
        json_data['name'] = name
        json_data['des'] = "Missing"
        json_data['career'] = "Missing"
        big_data.append(json_data)
        continue
    body = bs(html)
    if not same_name_check(body):  # disambiguation page for this name
        single_name = same_name_href(body)
        if single_name == "":  # found people, but none related to cryptocurrency
            json_data['name'] = name
            json_data['des'] = "Missing"
            json_data['career'] = "Missing"
            # BUG FIX: the original `continue`d without appending, silently
            # dropping this record from big_data.
            big_data.append(json_data)
            continue
        html = urlopen("http://en.wikipedia.org/wiki/" + single_name)
        body = bs(html)
    heading = body.find("h1", {"id": "firstHeading"})
    display_name = heading.text
    # BUG FIX: the original tested `name.text in "("` (membership reversed, so
    # effectively always False); the intent was to strip a "(qualifier)" suffix,
    # e.g. "Joseph Lubin (teacher)" -> "Joseph Lubin ".
    if "(" in display_name:
        display_name = display_name.split("(")[0]
    # First sentence of the first paragraph serves as the description.
    des = get_first_sentence(body.find("div", {"id": "mw-content-text"}).find("p").getText())
    json_data['name'] = display_name  # BUG FIX: success path never stored the name
    json_data['des'] = des
    careers = body.findAll("span", {"id": re.compile("career", re.I)})  # career section heading(s)
    if len(careers) == 0:  # page has no career section
        json_data['career'] = "Missing"
        career_text = "Missing"
    else:
        for career in careers:  # a page may hold more than one career section
            for sibling in career.parent.next_siblings:
                if sibling.name is None:
                    continue  # skip bare text nodes
                elif sibling.name == "h2":
                    break  # next top-level section: career text ends here
                else:
                    career_text += sibling.text.encode('utf-8')
        json_data['career'] = career_text
    print((display_name))
    print(des.strip())
    print(career_text)
    print("================")
    # BUG FIX: the original appended json_data both mid-loop and here, putting
    # the same dict into big_data twice per person; append exactly once.
    big_data.append(json_data)
Sign up for free to join this conversation on GitHub.
Already have an account?
Sign in to comment