Skip to content

Instantly share code, notes, and snippets.

@riza
Created August 1, 2018 15:07
Show Gist options
  • Save riza/15be7e513120274aab1872dc62b6479e to your computer and use it in GitHub Desktop.
Save riza/15be7e513120274aab1872dc62b6479e to your computer and use it in GitHub Desktop.
https://hiphoplife.com.tr üzerindeki tüm albümleri/singleları çekmek için bişi. Kahrolsun arşivci kişiliğim. 🤔
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# TODO: Mega downloader
import json
import requests
from bs4 import BeautifulSoup
import re
import codecs
class HLCrawler(object):
    """Crawl the hiphoplife.com.tr MP3 listing and collect download links.

    Constructing an instance immediately crawls listing pages
    1..``max_page`` (the network I/O happens inside ``__init__``).  Every
    file block that could be resolved is stored in ``PAGES`` as
    ``{index: {'title': ..., 'url': ...}}`` where ``index`` is a running
    counter starting at 0.

    The code avoids version-specific constructs so that it runs on both
    Python 2 and Python 3.
    """

    # Seconds to wait on any single HTTP request.  requests applies no
    # timeout unless one is passed, so a stalled connection would otherwise
    # block the crawl indefinitely.
    REQUEST_TIMEOUT = 30

    # JavaScript redirect embedded in the intermediate download page.
    # Compiled once; the dot is escaped so it only matches a literal ".".
    REDIRECT_RE = re.compile(r"window\.location = '(.*?)'")

    def __init__(self, max_page=196):
        """Initialise the crawl state and crawl the listing right away.

        max_page -- number of the last listing page to fetch (pages are
            numbered from 1).  Defaults to 196, the value that used to be
            hard-coded.  Zero or a negative number crawls nothing.
        """
        self.URL = "https://www.hiphoplife.com.tr/dosyalar/Mp3/orderby,2/page,{0}/"
        # Not written to anywhere in this class; kept because it is a
        # public attribute that outside code may read.
        self.TRACKS = {}
        self.PAGES = {}
        self.MAX_PAGE = max_page
        self.pagesKey = 0  # next free key in PAGES
        # range() rather than xrange(): xrange does not exist on Python 3.
        for page in range(1, self.MAX_PAGE + 1):
            print("\n{0}. sayfa crawl ediliyor.\n".format(page))
            self.crawlPagination(self.URL.format(page))

    @staticmethod
    def _printable(text):
        """Return ``text`` in a form ``print`` renders properly on Python 2 and 3."""
        # Python 2: unicode is not ``str`` and gets UTF-8 encoded, exactly as
        # the original code did.  Python 3: text already is ``str``; encoding
        # it would make print() show a b'...' repr.
        if isinstance(text, str):
            return text
        return text.encode('utf-8')

    def getMagaLink(self, url):
        """Resolve a download page of the site to the final file URL.

        Fetches ``url``, reads the ``window.location = '...'`` JavaScript
        redirect out of the returned HTML, requests that target and returns
        the URL the second request ends up at (``Response.url``).  Judging
        by the method name the result is expected to be a Mega link; the
        code does not verify the host.

        Raises ValueError when the page contains no such redirect
        (previously this surfaced as an AttributeError on ``None``), and
        propagates whatever ``requests`` raises on a network failure.
        """
        page = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        match = self.REDIRECT_RE.search(page.text)
        if match is None:
            # !r keeps the message ASCII-safe on Python 2 for unicode URLs.
            raise ValueError(
                "no window.location redirect found at {0!r}".format(url))
        target = requests.get(match.group(1), timeout=self.REQUEST_TIMEOUT)
        return target.url

    def crawlPagination(self, url):
        """Record every resolvable file listed on one listing page.

        For each ``div`` with class ``remositoryfileblock`` the text of its
        first ``<a>`` is used as the title and the ``href`` of its second
        ``<a href=...>`` as the download page, which is resolved through
        ``getMagaLink``.  Each success is stored in ``self.PAGES`` under
        ``self.pagesKey``, which is then incremented.

        Failures are reported with "Dead." and skipped instead of aborting:
        an unreachable listing page skips the page, and a block with too
        few links, without a redirect, or whose download page cannot be
        fetched skips only that block.
        """
        try:
            listing = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            print("Dead. \n")
            return

        soup = BeautifulSoup(listing.text, 'html.parser')
        for block in soup.find_all("div", {"class": "remositoryfileblock"}):
            # The try is per block so that one malformed entry does not
            # discard the remaining entries of the page.
            try:
                title = block.find_all("a")[0].text
                download_page = block.find_all("a", href=True)[1]["href"]
                final_url = self.getMagaLink(download_page)
            except (IndexError, ValueError, requests.RequestException):
                print("Dead. \n")
                continue

            self.PAGES[self.pagesKey] = {'title': title, 'url': final_url}
            print("{0} Bulundu -> URL -> {1}".format(
                self._printable(title).strip(), self._printable(final_url)))
            self.pagesKey += 1
# Script entry point.  Instantiating HLCrawler is what starts the crawl:
# its __init__ loops over the listing pages, so nothing else is called here.
if __name__ == "__main__":
    HLCrawler()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment