Last active
February 7, 2019 03:51
-
-
Save bhairavmehta95/356803e9f7e0d3f1b574b577265e9ae4 to your computer and use it in GitHub Desktop.
A simple text extraction and preprocessing script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import re | |
import numpy as np | |
def preprocess_raw_text(raw_text):
    """Flatten scraped text and strip it down to plain ASCII.

    Each hyphen, newline and literal ``...`` sequence is replaced by a
    single space, after which every non-ASCII character is dropped.

    Args:
        raw_text: The text to clean, as a ``str``.

    Returns:
        The cleaned text, always as a ``str``.
    """
    # Applied in this order so the result matches sequential replacement.
    for token in ('-', '\n', '...'):
        raw_text = raw_text.replace(token, ' ')

    # NOTE(review): as it appears here this swaps a single space for a single
    # space, i.e. a no-op. Presumably the first literal was originally a
    # double space or a non-breaking space -- confirm against the source.
    raw_text = raw_text.replace(' ', ' ')

    # encode(..., 'ignore') discards non-ASCII characters; the resulting
    # bytes are pure ASCII, so decoding back to str cannot fail and needs
    # no exception handler.
    return raw_text.encode('ascii', 'ignore').decode('ascii')
def extract_text(url, timeout=30):
    """Download a web page and return the cleaned text of its paragraphs.

    The page is fetched with ``requests``, parsed with BeautifulSoup's
    ``html.parser``, and the ``.string`` of every ``<p>`` tag matched by
    ``find_all('p', text=True)`` is joined with single spaces before being
    passed through ``preprocess_raw_text``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        Whatever ``preprocess_raw_text`` returns for the joined paragraph
        text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection problems or a timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of scraping the error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # text=True restricts the match to <p> tags that expose a string, so
    # item.string below is presumably never None -- confirm against bs4 docs.
    paragraphs = soup.find_all('p', text=True)
    raw_text = ' '.join(item.string for item in paragraphs)
    return preprocess_raw_text(raw_text)
if __name__ == '__main__':
    # Script entry point: fetch one article and print its cleaned text.
    article_url = 'https://www.nytimes.com/2019/01/25/science/mars-opportunity-rover.html'
    cleaned_text = extract_text(article_url)
    print(cleaned_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment