Skip to content

Instantly share code, notes, and snippets.

@Amrithasuresh
Created March 6, 2017 10:13
Show Gist options
  • Save Amrithasuresh/2a1056ec90f0248146a3b014ace1212b to your computer and use it in GitHub Desktop.
Save Amrithasuresh/2a1056ec90f0248146a3b014ace1212b to your computer and use it in GitHub Desktop.
#!/usr/bin/python3
from __future__ import print_function
import requests
import re
from datetime import datetime
from bs4 import BeautifulSoup
# open the file
writefile = open('scientific_report_analysis.txt', 'w')
# write the below string as a first line in the file
writefile.write("DOI, Date_recieved, Date_accepted, Date_published, Year, \
Time_taken_to_accept, Time_taken_to_publish, Total_time" + "\n")
# We create a function that takes the text with dates as double codes in the page and returns dates
def double_quotes(text):
try:
# matches string betweent double quotes
matches = re.findall(r'\"(.+?)\"', text)
return matches[0],matches[1],matches[2]
except:
return None
# We create a function that takes Scientific article (url) and fetches \
# "recieved date, reviewed date and online published year
# and also prints the difference in dates as date_recieved, ,date_accepted, date_publised
def fetch_publication_date(url):
source_code = requests.get(url)
plain_text = source_code.text
doi = re.search("doi:\d{2}.\d{4}\/\w+", plain_text)
doi = doi.group(0)
soup = BeautifulSoup(plain_text,"lxml")
# This fetches the word "time"
# Sample "a" variable contains below line
# [<time datetime="2011-02-10">10 February 2011</time>,
# <time datetime="2011-03-01">01 March 2011</time>,
# <time datetime="2011-06-14" itemprop="datePublished">14 June 2011</time>]
a = soup.findAll('time')
# this calls the function "double_quotes" above
if double_quotes(str(a)):
date_recieved, date_accepted, date_published = double_quotes(str(a))
year = re.findall(r'\d{4}',date_published)
year = ','.join(year)
try:
# calculate the difference of date using datetime module
t1 = datetime.strptime(date_recieved, '%Y-%m-%d')
t2 = datetime.strptime(date_accepted, '%Y-%m-%d')
t3 = datetime.strptime(date_published, '%Y-%m-%d')
time_taken_to_accept = ((t2 - t1).days)
time_taken_to_publish = ((t3 - t2).days)
total_time = ((t3 - t1).days)
return doi, date_recieved, date_accepted, date_published, str(year), str(time_taken_to_accept), str(
time_taken_to_publish), str(total_time)
except:
return None
# Scientific report article starts from srep00001 to srep30098
# fetch_publication_date eg ('http://www.nature.com/articles/srep00228')
for i in range(1, 30100, 1):
try:
num = "{:05d}".format(i)
url = "http://www.nature.com/articles/srep" + str(num)
print("Fetching the article number:\n",url)
data = fetch_publication_date(url)
if data:
#print('{0} \n'.format(",".join(str(x) for x in data)))
writefile.write('{0} \n'.format(",".join(str(x) for x in data)))
except:
pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment