Skip to content

Instantly share code, notes, and snippets.

@bhairavmehta95
Last active February 7, 2019 03:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bhairavmehta95/356803e9f7e0d3f1b574b577265e9ae4 to your computer and use it in GitHub Desktop.
Save bhairavmehta95/356803e9f7e0d3f1b574b577265e9ae4 to your computer and use it in GitHub Desktop.
A simple text extraction and preprocessing script
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
def preprocess_raw_text(raw_text):
    """Flatten scraped text into plain ASCII with simple separators removed.

    Hyphens, newlines, literal ``...`` ellipses and non-breaking spaces are
    each replaced by a single space; every remaining non-ASCII character is
    then dropped.

    Args:
        raw_text: The text to clean, as a ``str``.

    Returns:
        The cleaned text as a ``str`` containing only ASCII characters.
    """
    # Non-breaking spaces (U+00A0) must be turned into ordinary spaces before
    # the ASCII filter below, which would otherwise delete them and fuse the
    # words on either side.
    for separator in ('-', '\n', '...', '\xa0'):
        raw_text = raw_text.replace(separator, ' ')
    # encode(..., 'ignore') silently drops anything outside ASCII; the result
    # is pure ASCII bytes, so decoding back to str cannot fail and needs no
    # exception handling.
    return raw_text.encode('ascii', 'ignore').decode('ascii')
def extract_text(url, timeout=10):
    """Download a web page and return the cleaned text of its paragraphs.

    The page is parsed with BeautifulSoup's ``html.parser``. The ``.string``
    of every ``<p>`` element returned by ``find_all('p', string=True)`` is
    joined with single spaces and passed through ``preprocess_raw_text``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Defaults to 10.

    Returns:
        The value returned by ``preprocess_raw_text`` for the joined
        paragraph text.

    Raises:
        requests.exceptions.RequestException: If the request fails, including
            when the server does not respond within ``timeout``.
    """
    # Without an explicit timeout, requests will wait indefinitely on an
    # unresponsive server.
    html = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(html, 'html.parser')
    # ``string=`` is the current spelling of the filter formerly named
    # ``text=``; it keeps only <p> tags that have a ``.string``.
    paragraphs = soup.find_all('p', string=True)
    raw_text = ' '.join(paragraph.string for paragraph in paragraphs)
    return preprocess_raw_text(raw_text)
if __name__ == '__main__':
    # Script entry point: fetch one sample article and print its cleaned text.
    article_url = 'https://www.nytimes.com/2019/01/25/science/mars-opportunity-rover.html'
    print(extract_text(article_url))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment