Skip to content

Instantly share code, notes, and snippets.

@NetBUG
Created September 11, 2016 16:33
Show Gist options
  • Save NetBUG/c706008a6f475912d620e51b626f6cac to your computer and use it in GitHub Desktop.
Save NetBUG/c706008a6f475912d620e51b626f6cac to your computer and use it in GitHub Desktop.
#!/usr/bin/python3
import requests
from bs4 import BeautifulSoup
def clean(url, timeout=30):
    """Download a web page and return its visible text.

    The page at *url* is fetched with ``requests`` and parsed with
    BeautifulSoup's ``lxml`` parser.  ``<script>`` and ``<style>`` elements
    are removed, carriage returns are treated as newlines, runs of newlines
    are collapsed to a single one, and leading/trailing whitespace is
    stripped.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, passed straight to
            ``requests.get``.  Without a timeout the call can block
            indefinitely on an unresponsive host.

    Returns:
        The cleaned page text as a ``str``.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` (e.g. connection failure or timeout).
    """
    response = requests.get(url, timeout=timeout)

    # When the server declares no charset, requests falls back to ISO-8859-1
    # for text/* responses, so ``response.text`` garbles non-Latin pages.
    # Hand the raw bytes to BeautifulSoup so it can detect the encoding
    # itself, and pass the declared charset as a hint only if there is one.
    content_type = response.headers.get("content-type", "")
    declared = response.encoding if "charset" in content_type.lower() else None
    soup = BeautifulSoup(response.content, "lxml", from_encoding=declared)

    # Script and style bodies are not visible text; remove them from the tree.
    for tag in soup(["script", "style"]):
        tag.extract()

    text = soup.get_text().replace("\r", "\n")
    # Dropping the empty pieces between consecutive newlines collapses every
    # run of newlines to one in a single pass.
    text = "\n".join(piece for piece in text.split("\n") if piece)
    return text.strip()
if __name__ == '__main__':
    # Script entry point: print the cleaned text of a page.
    # Usage: script.py [URL]  -- the sample page below is used when no URL
    # is given, which keeps the original no-argument behaviour.
    import sys

    default_url = 'http://webcache.googleusercontent.com/search?q=cache:JqL5UFr1tGoJ:ru.fallout.wikia.com/wiki/%25D0%259F%25D0%25BB%25D0%25B0%25D1%2581%25D1%2582%25D0%25B8%25D0%25BA%25D0%25BE%25D0%25B2%25D0%25B0%25D1%258F_%25D0%25B2%25D0%25B7%25D1%2580%25D1%258B%25D0%25B2%25D1%2587%25D0%25B0%25D1%2582%25D0%25BA%25D0%25B0+&cd=9&hl=en&ct=clnk&gl=ru&client=ubuntu'
    url = sys.argv[1] if len(sys.argv) > 1 else default_url
    print(clean(url))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment