Skip to content

Instantly share code, notes, and snippets.

@emreuenal
Created February 9, 2020 18:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save emreuenal/4df6073332d305a38f564b525c2a240d to your computer and use it in GitHub Desktop.
Save emreuenal/4df6073332d305a38f564b525c2a240d to your computer and use it in GitHub Desktop.
A small Python script to download all papers from http://nek.istanbul.edu.tr:4444/ekos/GAZETE/
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Root listing page; every paper, year and file is discovered by following
# links that start here.
url = "http://nek.istanbul.edu.tr:4444/ekos/GAZETE/"

# (connect, read) timeouts in seconds, so a stalled server cannot hang the run.
REQUEST_TIMEOUT = (10, 60)
# Bytes written to disk per chunk while streaming a download.
CHUNK_SIZE = 64 * 1024


def _fetch_soup(page_url: str):
    """Download ``page_url`` and return it parsed with the lxml parser.

    Raises:
        requests.RequestException: on connection problems, timeouts, or an
            HTTP error status (so an error page is never parsed as a listing).
    """
    page = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    return BeautifulSoup(page.text, 'lxml')


def _labelled_links(rows, base_url: str):
    """Yield ``(label, absolute_url)`` for each usable table row.

    A row is usable when it has a ``td.tm-text-left`` cell (the label) and an
    ``<a href=...>``; the first anchor in the row is used. Any other row
    (headers, spacers) is skipped. Links are resolved against ``base_url`` so
    relative hrefs work too.
    """
    for row in rows:
        label_cell = row.find("td", class_="tm-text-left")
        anchor = row.find("a")
        if label_cell is None or anchor is None or not anchor.get("href"):
            continue
        yield label_cell.text, urljoin(base_url, anchor["href"])


def _download_file(file_url: str, destination: str) -> None:
    """Stream ``file_url`` to ``destination``.

    The data is written to ``destination + ".part"`` and only renamed into
    place once the transfer finished, so an interrupted run never leaves a
    truncated file that a later run would mistake for a finished download.

    Raises:
        requests.RequestException: on network errors or an HTTP error status.
        OSError: if the file cannot be written.
    """
    partial_path = destination + ".part"
    try:
        with requests.get(file_url, allow_redirects=True, stream=True,
                          timeout=REQUEST_TIMEOUT) as reply:
            reply.raise_for_status()
            with open(partial_path, 'wb') as fd:
                for chunk in reply.iter_content(chunk_size=CHUNK_SIZE):
                    if chunk:  # skip keep-alive chunks
                        fd.write(chunk)
        os.replace(partial_path, destination)
    finally:
        # Only still present when the transfer failed part-way through.
        if os.path.exists(partial_path):
            os.remove(partial_path)


def _mirror_year(year_url: str, year_folder_path: str) -> None:
    """Download every file linked from one year page into ``year_folder_path``.

    Files that already exist are skipped. A failure on the year page or on a
    single file is reported and skipped so the rest of the crawl continues.
    """
    os.makedirs(year_folder_path, exist_ok=True)
    try:
        year_page = _fetch_soup(year_url)
    except requests.RequestException as error:
        print("Skipping " + year_url + ": " + str(error))
        return

    for cell in year_page.find_all("td"):
        anchor = cell.find("a")
        if anchor is None or not anchor.get("href"):
            continue
        paper_down_url = urljoin(year_url, anchor["href"])
        paper_file_name = paper_down_url.split("/")[-1]
        if not paper_file_name:
            # The link ends in "/": it points at a page, not at a file.
            continue
        where_to_download = os.path.join(year_folder_path, paper_file_name)
        if os.path.isfile(where_to_download):
            print("already downloaded")
            continue
        print("Downloading: " + paper_file_name)
        try:
            _download_file(paper_down_url, where_to_download)
        except (requests.RequestException, OSError) as error:
            print("Failed: " + paper_down_url + ": " + str(error))


def main() -> None:
    """Mirror the archive under ``url`` into the current working directory.

    Layout on disk: ``<paper name>/<year>/<file name>``, where the folder
    names are the link labels from the listing pages and the file name is the
    last path segment of the download URL.

    Raises:
        requests.RequestException: if the root listing cannot be fetched.
        SystemExit: if the root page has no ``section.tm-section``.
    """
    index_page = _fetch_soup(url)
    section = index_page.find("section", class_="tm-section")
    if section is None:
        raise SystemExit("No paper listing (section.tm-section) found at " + url)

    for paper_name, paper_link in _labelled_links(section.find_all("tr"), url):
        print(paper_name)
        paper_folder_path = os.path.abspath(paper_name)
        os.makedirs(paper_folder_path, exist_ok=True)
        try:
            paper_page = _fetch_soup(paper_link)
        except requests.RequestException as error:
            print("Skipping " + paper_link + ": " + str(error))
            continue
        year_links = _labelled_links(paper_page.find_all("tr"), paper_link)
        for paper_year_text, paper_year_url in year_links:
            _mirror_year(
                paper_year_url,
                os.path.join(paper_folder_path, paper_year_text),
            )


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment