aurora1625/extract_pubmed_abstract.py

## extract_pubmed_abstract.py
__author__ = 'sean'

from bs4 import BeautifulSoup
import os
import cPickle as pickle

# Directory holding the saved abstract pages; every entry in it is parsed below.
path = '/Users/sean/ml/dataset/pubmed-bioinfo-abstracts/paperAbstracts/'
filenames = os.listdir(path)

# Collect at most one abstract string per input file, then pickle the list.
txt_corpus = []
for thefile in filenames:
    print(thefile)
    # macOS drops a .DS_Store metadata file into folders; it is not a page.
    if thefile == ".DS_Store":
        continue
    # Read the raw bytes and close the file before the parsing work starts.
    with open(os.path.join(path, thefile), "rb") as f:
        strings = f.read()
    # NOTE(review): no parser is named here, so the parse tree may depend on
    # which parser bs4 finds installed -- confirm before pinning one.
    soup = BeautifulSoup(strings)
    # Reset for every file: otherwise a page without an abstract would either
    # raise NameError (first file) or re-append the previous file's abstract.
    abstract = None
    for hit in soup.findAll(attrs={'class': 'abstract_text'}):
        # The text is taken from the second child node of the matched
        # element; when several elements match, the last one wins.
        abstract = hit.contents[1].text
    if abstract is None:
        print('no abstract_text element found, skipping: ' + thefile)
        continue
    txt_corpus.append(abstract)
print('done')
# Persist the collected abstracts as a pickled list in the working directory.
with open('pubmed_abstract.pkl', 'wb') as dicpkl:
    pickle.dump(txt_corpus, dicpkl)
print('pickle saved')
	__author__ = 'sean'

	from bs4 import BeautifulSoup
	import os
	import cPickle as pickle

	# Directory holding the saved abstract pages; every entry in it is parsed below.
	path = '/Users/sean/ml/dataset/pubmed-bioinfo-abstracts/paperAbstracts/'
	filenames = os.listdir(path)

	# Collect at most one abstract string per input file, then pickle the list.
	txt_corpus = []
	for thefile in filenames:
	    print(thefile)
	    # macOS drops a .DS_Store metadata file into folders; it is not a page.
	    if thefile == ".DS_Store":
	        continue
	    # Read the raw bytes and close the file before the parsing work starts.
	    with open(os.path.join(path, thefile), "rb") as f:
	        strings = f.read()
	    # NOTE(review): no parser is named here, so the parse tree may depend on
	    # which parser bs4 finds installed -- confirm before pinning one.
	    soup = BeautifulSoup(strings)
	    # Reset for every file: otherwise a page without an abstract would either
	    # raise NameError (first file) or re-append the previous file's abstract.
	    abstract = None
	    for hit in soup.findAll(attrs={'class': 'abstract_text'}):
	        # The text is taken from the second child node of the matched
	        # element; when several elements match, the last one wins.
	        abstract = hit.contents[1].text
	    if abstract is None:
	        print('no abstract_text element found, skipping: ' + thefile)
	        continue
	    txt_corpus.append(abstract)
	print('done')
	# Persist the collected abstracts as a pickled list in the working directory.
	with open('pubmed_abstract.pkl', 'wb') as dicpkl:
	    pickle.dump(txt_corpus, dicpkl)
	print('pickle saved')