# celiacintas/scrap_script.py

## scrap_script.py
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

import BeautifulSoup as soup
import pandas as pd
import httplib2
import re

# Family reference codes, in the order they are scraped; each one is appended
# to URL_FAM to address that family's panel page.
CODE_FAM = [
	'MGP00016', 'MGP00015', 'MGP00022', 'MGP00003', 'MGP00012',
	'MGP00005', 'MGP00023', 'MGP00001', 'MGP00019', 'MGP00018',
	'MGP00017', 'MGP00002', 'MGP00010', 'MGP00025', 'MGP00009',
	'MGP00014', 'MGP00008', 'MGP00021', 'MGP00006', 'MGP00011',
	'MGP00004', 'MGP00020', 'MGP00024', 'MGP00007', 'MGP00013',
]

# Base of the catalog search section; the href of every sample link found on
# a panel page is joined onto it.
URL_IND = 'https://catalog.coriell.org/0/Sections/Search/'
# Panel detail page inside the same section; a family code is appended as the
# value of the trailing Ref parameter.
URL_FAM = URL_IND + 'Panel_Detail.aspx?PgId=202&Ref='

def get_people_remark(url):
	"""Download one page and return the text of its id and remark spans.

	url -- full address of the page to fetch.

	Returns a (id, remark) tuple: the text of the <span id="lblRef"> element
	followed by the text of the <span id="lblCat_Remark"> element.

	Raises ValueError when either span is absent from the downloaded page.
	"""
	# NOTE(review): certificate validation is switched off for this request;
	# confirm that is still required.
	http = httplib2.Http(".cache", disable_ssl_certificate_validation=True)
	_, body = http.request(url)

	page = soup.BeautifulSoup(body)
	id_span = page.find('span', {'id': 'lblRef'})
	remark_span = page.find('span', {'id': 'lblCat_Remark'})

	# find() gives None for a missing element; report which page was at fault
	# instead of failing later with an AttributeError on None.
	if id_span is None or remark_span is None:
		raise ValueError(
			'expected spans lblRef and lblCat_Remark not found in %s' % url)

	return id_span.text, remark_span.text

def main(output_path='/tmp/test.txt'):
	"""Scrape every family panel page and write one CSV row per sample link.

	For each code in CODE_FAM the page at URL_FAM + code is downloaded. Every
	<a> whose href starts with "Sample_Detail" is resolved against URL_IND and
	passed to get_people_remark(), and a [family code, id, remark] row is
	collected. The rows are written as CSV without index or header.

	output_path -- file the CSV is written to; the default is the path the
	               script has always used.
	"""
	# The client and the link pattern do not depend on the family being
	# processed, so they are built once and shared by all requests.
	# NOTE(review): certificate validation is switched off; confirm that is
	# still required.
	http = httplib2.Http(".cache", disable_ssl_certificate_validation=True)
	sample_link = re.compile(r'^Sample\_Detail')

	lines = []
	for code_fam in CODE_FAM:
		# Panel page listing the individuals of this family.
		_, body = http.request(URL_FAM + code_fam)
		my_soup = soup.BeautifulSoup(body)

		for ind in my_soup.findAll('a', href=sample_link):
			id_, remark = get_people_remark(URL_IND + ind['href'])
			lines.append([code_fam, id_, remark])

	df = pd.DataFrame(lines)
	df.to_csv(output_path, index=False, header=False)


# Start the scrape only when this file is executed as a script.
if __name__ == '__main__':
	main()
	#!/usr/bin/env python2
	# -*- coding: utf-8 -*-

	import BeautifulSoup as soup
	import pandas as pd
	import httplib2
	import re

	# Family reference codes, in the order they are scraped; each one is
	# appended to URL_FAM to address that family's panel page.
	CODE_FAM = [
		'MGP00016', 'MGP00015', 'MGP00022', 'MGP00003', 'MGP00012',
		'MGP00005', 'MGP00023', 'MGP00001', 'MGP00019', 'MGP00018',
		'MGP00017', 'MGP00002', 'MGP00010', 'MGP00025', 'MGP00009',
		'MGP00014', 'MGP00008', 'MGP00021', 'MGP00006', 'MGP00011',
		'MGP00004', 'MGP00020', 'MGP00024', 'MGP00007', 'MGP00013',
	]

	# Base of the catalog search section; the href of every sample link found
	# on a panel page is joined onto it.
	URL_IND = 'https://catalog.coriell.org/0/Sections/Search/'
	# Panel detail page inside the same section; a family code is appended as
	# the value of the trailing Ref parameter.
	URL_FAM = URL_IND + 'Panel_Detail.aspx?PgId=202&Ref='

	def get_people_remark(url):
		"""Download one page and return the text of its id and remark spans.

		url -- full address of the page to fetch.

		Returns a (id, remark) tuple: the text of the <span id="lblRef">
		element followed by the text of the <span id="lblCat_Remark"> element.

		Raises ValueError when either span is absent from the downloaded page.
		"""
		# NOTE(review): certificate validation is switched off for this
		# request; confirm that is still required.
		http = httplib2.Http(".cache", disable_ssl_certificate_validation=True)
		_, body = http.request(url)

		page = soup.BeautifulSoup(body)
		id_span = page.find('span', {'id': 'lblRef'})
		remark_span = page.find('span', {'id': 'lblCat_Remark'})

		# find() gives None for a missing element; report which page was at
		# fault instead of failing later with an AttributeError on None.
		if id_span is None or remark_span is None:
			raise ValueError(
				'expected spans lblRef and lblCat_Remark not found in %s' % url)

		return id_span.text, remark_span.text

	def main(output_path='/tmp/test.txt'):
		"""Scrape every family panel page and write one CSV row per sample link.

		For each code in CODE_FAM the page at URL_FAM + code is downloaded.
		Every <a> whose href starts with "Sample_Detail" is resolved against
		URL_IND and passed to get_people_remark(), and a
		[family code, id, remark] row is collected. The rows are written as
		CSV without index or header.

		output_path -- file the CSV is written to; the default is the path
		               the script has always used.
		"""
		# The client and the link pattern do not depend on the family being
		# processed, so they are built once and shared by all requests.
		# NOTE(review): certificate validation is switched off; confirm that
		# is still required.
		http = httplib2.Http(".cache", disable_ssl_certificate_validation=True)
		sample_link = re.compile(r'^Sample\_Detail')

		lines = []
		for code_fam in CODE_FAM:
			# Panel page listing the individuals of this family.
			_, body = http.request(URL_FAM + code_fam)
			my_soup = soup.BeautifulSoup(body)

			for ind in my_soup.findAll('a', href=sample_link):
				id_, remark = get_people_remark(URL_IND + ind['href'])
				lines.append([code_fam, id_, remark])

		df = pd.DataFrame(lines)
		df.to_csv(output_path, index=False, header=False)


	# Start the scrape when this file is executed as a script.
	# NOTE(review): this guard sits inside the indented region that follows an
	# earlier `main()` call in this file; confirm a second run is intended.
	if __name__ == '__main__':
		main()