Skip to content

Instantly share code, notes, and snippets.

@celiacintas
Last active August 29, 2015 14:07
Show Gist options
  • Save celiacintas/d36a75905a4837657d27 to your computer and use it in GitHub Desktop.
Save celiacintas/d36a75905a4837657d27 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import BeautifulSoup as soup
import pandas as pd
import httplib2
import re
# Reference codes that main() appends to URL_FAM, one request per code.
# The list order is the order in which rows end up in the output file.
CODE_FAM = [
    'MGP00016', 'MGP00015', 'MGP00022', 'MGP00003', 'MGP00012',
    'MGP00005', 'MGP00023', 'MGP00001', 'MGP00019', 'MGP00018',
    'MGP00017', 'MGP00002', 'MGP00010', 'MGP00025', 'MGP00009',
    'MGP00014', 'MGP00008', 'MGP00021', 'MGP00006', 'MGP00011',
    'MGP00004', 'MGP00020', 'MGP00024', 'MGP00007', 'MGP00013',
]

# Base that the relative "Sample_Detail..." hrefs found on a panel page are
# appended to.
URL_IND = 'https://catalog.coriell.org/0/Sections/Search/'

# Panel detail page; a code from CODE_FAM is appended as the "Ref" value.
URL_FAM = 'https://catalog.coriell.org/0/Sections/Search/Panel_Detail.aspx?PgId=202&Ref='
def get_people_remark(url):
    """Fetch a sample detail page and return its reference id and remark.

    The page at *url* is downloaded through an ``httplib2`` client that uses
    the on-disk ``.cache`` directory, parsed with BeautifulSoup, and the text
    of the ``<span>`` elements with ids ``lblRef`` and ``lblCat_Remark`` is
    extracted.

    Args:
        url: Absolute URL of the page to scrape.

    Returns:
        A ``(id, remark)`` tuple of strings. A span that is missing from the
        page yields an empty string, so one unusual page does not abort a
        long scrape with an ``AttributeError`` on ``None``.
    """
    # NOTE(security): certificate validation is switched off, so the TLS peer
    # is not authenticated. Left as the script had it; turn validation on if
    # the host's certificate chain verifies in your environment.
    http = httplib2.Http(".cache", disable_ssl_certificate_validation=True)
    _, body = http.request(url)
    my_soup = soup.BeautifulSoup(body)

    def span_text(span_id):
        # find() returns None when no matching tag exists; compare with
        # "is not None" rather than truthiness of the tag object.
        tag = my_soup.find('span', {'id': span_id})
        return tag.text if tag is not None else ''

    return span_text('lblRef'), span_text('lblCat_Remark')
def main(output_path='/tmp/test.txt'):
    """Scrape every panel page and write one CSV row per individual.

    For each code in ``CODE_FAM`` the page ``URL_FAM + code`` is downloaded,
    every anchor whose ``href`` starts with ``Sample_Detail`` is followed via
    ``get_people_remark``, and a ``[family code, id, remark]`` row is
    collected. The rows are written as CSV without index or header line.

    Args:
        output_path: Destination file for the CSV. Defaults to the path the
            script originally hard-coded.
    """
    # Loop invariants: one HTTP client (sharing the on-disk ".cache") and one
    # compiled pattern serve every family page.
    # NOTE(security): certificate validation is switched off, so the TLS peer
    # is not authenticated. Left as the script had it.
    http = httplib2.Http(".cache", disable_ssl_certificate_validation=True)
    # Raw string: the former '\_' was an invalid string escape that only
    # worked by accident; a plain underscore matches the same text.
    sample_link = re.compile(r'^Sample_Detail')

    lines = []
    for code_fam in CODE_FAM:
        # One request per family code.
        _, body = http.request(URL_FAM + code_fam)
        my_soup = soup.BeautifulSoup(body)
        for ind in my_soup.findAll('a', href=sample_link):
            # The hrefs are relative to the search section, hence URL_IND.
            id_, remark = get_people_remark(URL_IND + ind['href'])
            lines.append([code_fam, id_, remark])

    df = pd.DataFrame(lines)
    df.to_csv(output_path, index=False, header=False)
# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment