Skip to content

Instantly share code, notes, and snippets.

@jessfraz
Created December 28, 2018 23:00
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jessfraz/a1aa52c7755bdac0a7f8d9e6fe701aca to your computer and use it in GitHub Desktop.
Save jessfraz/a1aa52c7755bdac0a7f8d9e6fe701aca to your computer and use it in GitHub Desktop.
Scrape CIA public PDF files
docker build --rm --force-rm -t r.j3ss.co/scrape . 

docker run --rm -it -v $(pwd)/results:/root/cia r.j3ss.co/scrape
#!/usr/local/bin/python
import lxml.html
from lxml.html.clean import Cleaner
import requests
import os.path
import signal
import sys
# Handle control+C.
def signal_handler(sig, frame):
    """Report the interrupt on stdout and end the process with status 0.

    Has the two-argument shape that ``signal.signal`` expects of a handler;
    neither the signal number nor the stack frame is used.
    """
    farewell = 'You pressed Ctrl+C! Exiting...'
    print(farewell)
    raise SystemExit(0)
def lxmlize(url):
    """Fetch *url* and return the page parsed as an lxml HTML element.

    Relative links in the parsed page are rewritten to absolute URLs using
    *url* as the base, so callers can follow ``href`` attributes directly.

    Any exception raised by ``requests.get`` (connection failure, timeout)
    propagates to the caller. The HTTP status code is not checked.
    """
    # Build the message as one string: under Python 2 the form
    # ``print("GET-ing", url)`` prints a tuple instead of the two values.
    print("GET-ing %s" % (url,))
    # Without a timeout, requests waits indefinitely on a stalled server.
    resp = requests.get(url, timeout=60)
    page = lxml.html.fromstring(resp.text)
    page.make_links_absolute(url)
    return page
def publications(url):
    """Yield the publication link elements found on the page at *url*.

    A link qualifies when it sits inside the ``content-core`` div and its
    href contains both ``csi-studies`` and ``.html``. The yielded values are
    the ``<a>`` elements themselves, not href strings.
    """
    link_query = (
        "//div[@id='content-core']"
        "//a[contains(@href, 'csi-studies') and contains(@href, '.html')]"
    )
    for anchor in lxmlize(url).xpath(link_query):
        yield anchor
def articles(url):
    """Yield ``(href, title)`` for each link on *url* whose href contains ".pdf".

    The title is built from the last path component of the href by applying
    the substitutions below in order and then title-casing the result.
    """
    # Order matters: ".pdf" is removed before any remaining bare "pdf".
    substitutions = (
        ("-", " "),
        (".pdf", ""),
        ("pdf", ""),
        ("%20", " "),
    )
    for anchor in lxmlize(url).xpath("//a[contains(@href, '.pdf')]"):
        link = anchor.attrib['href']
        label = os.path.basename(link)
        for old, new in substitutions:
            label = label.replace(old, new)
        yield link, label.title()
def _fetch_to_file(href, full_path):
    """Download *href* to *full_path*; return True if the file was written.

    The body is streamed into ``full_path + ".part"`` and renamed to
    *full_path* only after the last block is written. An interrupted run
    therefore never leaves a truncated file at the final path, where the
    exists-check in ``main`` would skip it forever.

    Responses with an HTTP error status are reported and skipped (returning
    False) instead of being saved as if they were PDFs. Exceptions raised by
    ``requests`` or by file I/O propagate to the caller.
    """
    partial_path = full_path + ".part"
    # stream=True defers the body so iter_content really reads in blocks.
    response = requests.get(href, stream=True, timeout=60)
    try:
        if not response.ok:
            print("Skipping %s (HTTP %s)" % (href, response.status_code))
            return False
        with open(partial_path, 'wb') as fd:
            for block in response.iter_content(1024):
                fd.write(block)
    finally:
        # A streamed response holds its connection until explicitly closed.
        response.close()
    os.rename(partial_path, full_path)
    return True


def main(page):
    """Download every PDF linked from the publications listed on *page*.

    Files are saved as ``cia/<pdir>/<publication name>/<article title>.pdf``
    relative to the current working directory. A file that already exists at
    its final path is skipped, so the crawl can be re-run to resume.
    """
    pdir = "Unclassified Extracts from Classified Studies"
    parentDir = "cia/" + pdir
    # Keep the Python 2 conversion where available; fall back on Python 3,
    # which has no ``unicode`` builtin.
    try:
        text_type = unicode  # noqa: F821 (Python 2 only)
    except NameError:
        text_type = str
    for publication in publications(page):
        publication_name = text_type(publication.text_content())
        # Drop the series title so the directory is named after the issue.
        name = publication_name.replace(pdir, "").strip()
        target_dir = "{}/{}".format(parentDir, name)
        for href, text in articles(publication.attrib['href']):
            full_path = "{}/{}.pdf".format(target_dir, text)
            if os.path.exists(full_path):
                continue
            print(full_path)
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            _fetch_to_file(href, full_path)
# Entry point: install the Ctrl+C handler before any network activity, then
# crawl starting from the Center for the Study of Intelligence library page.
START_URL = "https://www.cia.gov/library/center-for-the-study-of-intelligence"
signal.signal(signal.SIGINT, signal_handler)
main(START_URL)
# Image for running the cia.py scraper. The base is Python 2 on Alpine;
# cia.py calls the Python 2-only `unicode` builtin.
FROM python:2-alpine
# C compiler, libc headers and libxml2/libxslt development files; presumably
# needed so pip can build lxml's C extension on Alpine (TODO confirm).
RUN apk add --no-cache \
gcc \
libxml2-dev \
libxslt-dev \
musl-dev
# NOTE(review): /usr/include/libxml2 looks like a header directory rather than
# a shared-library directory, so this LD_LIBRARY_PATH entry may have no
# effect; confirm before removing.
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/include/libxml2
# Add the libxml2 header directory to the C compiler's include search path.
ENV C_INCLUDE_PATH=$C_INCLUDE_PATH:/usr/include/libxml2
# The two third-party packages imported by cia.py.
RUN pip install \
lxml \
requests
# Install the script and mark it executable so CMD can run it by name
# (assumes /usr/local/bin is on PATH in the base image; verify).
COPY cia.py /usr/local/bin/
RUN chmod +x /usr/local/bin/cia.py
# cia.py writes under the relative directory cia/, i.e. /root/cia here; the
# `docker run` usage line bind-mounts that path to ./results on the host.
WORKDIR /root
CMD ["cia.py"]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment