Skip to content

Instantly share code, notes, and snippets.

Created December 21, 2019 10:14
Show Gist options
  • Save niujiabenben/76b082a4e94ad922b8f946a7c1825995 to your computer and use it in GitHub Desktop.
Save niujiabenben/76b082a4e94ad922b8f946a7c1825995 to your computer and use it in GitHub Desktop.
# coding: utf-8
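"""Get the citation counts of the papers of a top conference and print its top 10% papers.

Google Scholar has anti-scraping measures, so all of the HTML source pages
are assumed to have been downloaded to local disk by hand.
"""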
import re
import os
from bs4 import BeautifulSoup
####################### Parse google scholar pages ############################
def _parse_article(article):
"""解析google scholar网页中的一个article."""
title = article.find_all("h3", "gs_rt")
assert len(title) == 1, title
title = title[0].find_all("a", href=re.compile("^http"))
assert len(title) == 1, title
### 这里在某些特殊字符(非ascii)的情况下会为None
title = title[0].string
cite = article.find_all("a", href=re.compile("cites"))
### 某些情况下文章没有cite信息
assert len(cite) <= 1, str(cite)
cite = int(cite[0].string.split()[-1]) if len(cite) > 0 else -1
return title, cite
def _parse_html(html):
    """Parse one locally saved Google Scholar results page.

    Args:
        html: Path of the saved HTML file; it is read as UTF-8.

    Returns:
        A list of ``(title, cite)`` tuples, one per ``div`` of class
        ``gs_ri`` whose title could be extracted. Titles are lower-cased.
    """
    with open(html, "r", encoding="utf-8") as srcfile:
        content = srcfile.read()
    soup = BeautifulSoup(content, features="html.parser")
    articles = []
    for article in soup.find_all("div", "gs_ri"):
        title, cite = _parse_article(article)
        # Entries whose title could not be extracted are skipped.
        if title is not None:
            articles.append((title.lower(), cite))
    return articles
def get_all_articles(html_dir):
    """Parse every saved Google Scholar page found in a directory.

    The pages are expected to have been saved to local disk by hand.

    Args:
        html_dir: Directory whose entries are all parsed as saved pages.

    Returns:
        A list with the ``(title, cite)`` tuples of all pages, concatenated
        in sorted file-name order.
    """
    articles = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # result (and anything derived from its order) reproducible.
    for name in sorted(os.listdir(html_dir)):
        path = os.path.join(html_dir, name)
        articles.extend(_parse_html(path))
    return articles
######################## Parse openaccess page ################################
def get_all_titles(openaccess_file):
    """Extract the paper titles from a locally saved openaccess page.

    The page is expected to have been saved to local disk by hand.

    Args:
        openaccess_file: Path of the saved HTML file; it is read as UTF-8.

    Returns:
        A list of lower-cased titles, one per ``div`` of class ``bibref``.

    Raises:
        AssertionError: If the fourth text line of a ``bibref`` block is not
            of the form ``title = {...},``.
        IndexError: If a ``bibref`` block has fewer than four text lines.
    """
    with open(openaccess_file, "r", encoding="utf-8") as srcfile:
        content = srcfile.read()
    soup = BeautifulSoup(content, features="html.parser")
    all_titles = []
    pattern = re.compile("^title = {(.+)},$")
    for article in soup.find_all("div", "bibref"):
        # The title entry is taken from the fourth line of the block's text.
        title_line = article.get_text().split("\n")[3]
        match = pattern.match(title_line)
        assert match is not None, title_line
        # Lower-cased so the titles compare equal to the lower-cased Google
        # Scholar titles after the letters-only normalization in main().
        all_titles.append(match.group(1).lower())
    return all_titles
############################## main part ######################################
def main(html_dir, openaccess_file):
    """Print the top 10% most-cited papers of a conference.

    Titles from the openaccess page are matched against the Google Scholar
    results by a normalized form (lower-cased, ASCII letters only). A summary
    line with the number of titles, matched titles and unmatched titles is
    printed first, followed by one line per top-10% paper.

    Args:
        html_dir: Directory of locally saved Google Scholar result pages.
        openaccess_file: Path of the locally saved openaccess page.
    """
    articles = get_all_articles(html_dir)
    titles = get_all_titles(openaccess_file)

    # Normalize the titles: lower-case them and keep ASCII letters only, so
    # that case, spacing and punctuation differences do not prevent a match.
    regex = re.compile('[^a-zA-Z]')
    normed_cite_map = {}
    for title, cite in articles:
        normed = regex.sub("", title.lower())
        # The same paper can show up in several saved pages; keep its
        # highest citation count instead of whichever was parsed last.
        normed_cite_map[normed] = max(cite, normed_cite_map.get(normed, cite))
    normed_titles = [regex.sub("", title.lower()) for title in titles]
    normed_title_map = dict(zip(normed_titles, titles))

    # Count how many papers were not found in the Google Scholar results.
    outside = set(normed_titles) - set(normed_cite_map)
    found = len(normed_titles) - len(outside)
    print("titles: {}, found: {}, outside: {}".format(
        len(titles), found, len(outside)))

    # Print the papers sorted by citation count (top 10%). Papers without a
    # Google Scholar match keep the -100 placeholder and therefore sort last.
    merged_map = {normed: -100 for normed in normed_titles}
    for normed, cite in normed_cite_map.items():
        if normed in merged_map:
            merged_map[normed] = max(merged_map[normed], cite)
    final_results = sorted(merged_map.items(), key=lambda x: -x[1])
    top10p = len(final_results) // 10
    for normed, cite in final_results[:top10p]:
        print("{:<140} {}".format(normed_title_map[normed], cite))
# Script entry point: process the CVPR 2017 pages saved next to this script.
if __name__ == "__main__":
    main("./cvpr2017", "./cvpr2017.html")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment