Skip to content

Instantly share code, notes, and snippets.

@aurora1625
Created November 20, 2013 23:46
Show Gist options
  • Save aurora1625/7573349 to your computer and use it in GitHub Desktop.
Save aurora1625/7573349 to your computer and use it in GitHub Desktop.
Extract only the abstracts from raw PubMed data
__author__ = 'sean'
from bs4 import BeautifulSoup
import os
import cPickle as pickle
# Extract the abstract text from every saved page under `path` and pickle the
# resulting list of strings to pubmed_abstract.pkl in the working directory.

# Directory of raw PubMed pages; every regular file in it is parsed as HTML.
path = '/Users/sean/ml/dataset/pubmed-bioinfo-abstracts/paperAbstracts/'
filenames = os.listdir(path)
txt_corpus = list()
for thefile in filenames:
    # Progress output. A single-argument print(...) prints the same text
    # whether it is parsed as a Python 2 statement or a Python 3 call.
    print(thefile)
    # macOS leaves a .DS_Store metadata file in directories; it is not a
    # paper, so skip it.
    if thefile == ".DS_Store":
        continue
    # os.path.join keeps working even if `path` loses its trailing slash.
    full_path = os.path.join(path, thefile)
    # open() fails on a sub-directory; skip anything that is not a regular
    # file instead of aborting the whole run.
    if not os.path.isfile(full_path):
        continue
    with open(full_path, "rb") as f:
        strings = f.read()
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed, and different parsers can build
    # different trees from the same markup.
    soup = BeautifulSoup(strings, 'html.parser')
    for hit in soup.find_all(attrs={'class': 'abstract_text'}):
        # The abstract is read from the SECOND child node of the element.
        # NOTE(review): presumably the first child is a whitespace text node
        # and the second is the tag wrapping the abstract -- confirm against
        # a sample page.
        children = hit.contents
        if len(children) < 2:
            # A malformed element must not discard the abstracts collected
            # so far; report it and move on.
            print('skipping abstract_text element without a second child in ' + thefile)
            continue
        abstract = children[1].text
        txt_corpus.append(abstract)
print('done')
# Binary mode is required for pickle output.
with open('pubmed_abstract.pkl', 'wb') as dicpkl:
    pickle.dump(txt_corpus, dicpkl)
print('pickle saved')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment