Skip to content

Instantly share code, notes, and snippets.

@chengjun
Created July 8, 2015 05:43
Show Gist options
  • Save chengjun/aee9eaa2772973d5cc1d to your computer and use it in GitHub Desktop.
Save chengjun/aee9eaa2772973d5cc1d to your computer and use it in GitHub Desktop.
tree network
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 07 15:40:57 2015
@author: chengwang6
"""
import urllib2
from bs4 import BeautifulSoup
## Seed of the crawler: the Google Scholar citations-profile URL that is
## passed to getTree() below as the starting point of the crawl.
seed = 'https://scholar.google.nl/citations?user=nNdt_G8AAAAJ&hl=en&oe=ASCII'
def coAuthors(url, baseUrl='http://scholar.google.nl'):
    """Fetch a profile page and return the links it lists as co-authors.

    The page at *url* is downloaded and every ``<a class="gsc_rsb_aa">``
    element in its body is treated as a co-author link.

    Args:
        url: Address of the profile page to download.
        baseUrl: Prefix joined in front of each anchor's ``href`` to build
            an absolute URL. Defaults to the host the original script used.

    Returns:
        A ``(coUrls, coNames)`` tuple of two parallel lists: the absolute
        URL and the visible text of each matching anchor. Both lists are
        empty when the page has no ``<body>`` or no matching anchors.

    Raises:
        Whatever ``urllib2.urlopen`` raises when the request fails.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()

    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed on the machine.
    soup = BeautifulSoup(html, 'html.parser')

    coUrls = []
    coNames = []
    if soup.body is None:
        # Nothing parseable came back (e.g. an empty response).
        return coUrls, coNames

    for anchor in soup.body.find_all('a', {"class": "gsc_rsb_aa"}):
        coNames.append(anchor.text)  # display name, used for the network plot
        coUrls.append(baseUrl + anchor['href'])
    return coUrls, coNames
def getTree(seed, seedName, Nstep, fetch=None):
    """Crawl the co-author network outward from *seed*.

    The seed page is fetched first. Then ``Nstep + 1`` expansion passes are
    run (the counter starts at 0 and the loop condition is ``n <= Nstep``).
    Each pass walks the nodes known at the start of that pass and fetches
    every child that has not been fetched yet. A running count of fetched
    children is printed as progress output.

    Args:
        seed: URL of the starting page.
        seedName: Display name to use for the seed node.
        Nstep: Controls the number of expansion passes (``Nstep + 1``).
        fetch: Optional callable ``url -> (urls, names)`` returning two
            parallel lists. Defaults to ``coAuthors``; supplying another
            callable lets the traversal run without network access.

    Returns:
        A dict mapping each fetched node's display name to the list of
        display names of its co-authors. Nodes are keyed by name, so two
        URLs that share a display name collapse into a single entry (the
        later one wins).
    """
    if fetch is None:
        fetch = coAuthors

    fetched = 0
    passes = 0
    tree = {}
    urlToName = {seed: seedName}

    def expand(url):
        # Record the node's children and remember their display names.
        urls, names = fetch(url)
        urlToName.update(zip(urls, names))
        tree[url] = urls

    expand(seed)
    while passes <= Nstep:
        passes += 1
        # Snapshot the keys: ``tree`` grows inside the loop, and iterating a
        # live dict view while inserting raises RuntimeError on Python 3.
        for root in list(tree):
            for child in tree[root]:
                if child not in tree:
                    fetched += 1
                    print(fetched)
                    expand(child)

    # Translate the URL-keyed tree into a name-keyed one.
    nameTree = {}
    for url, children in tree.items():
        nameTree[urlToName[url]] = [urlToName[c] for c in children]
    return nameTree
# Crawl the co-author network starting from the seed profile.
t = getTree(seed,'chengjun' ,1)
import json
# Persist the name-keyed tree as JSON. The default ASCII-escaped output is
# used so non-ASCII author names are written safely regardless of the file's
# byte encoding; json.load() restores them unchanged.
with open("D://namedict.dat",'w') as outfile:
    json.dump(t, outfile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment