Skip to content

Instantly share code, notes, and snippets.

@Parassharmaa
Created May 1, 2020 05:07
Show Gist options
  • Save Parassharmaa/ed9ff32fa8ee62acbbf12da9505287bd to your computer and use it in GitHub Desktop.
Save Parassharmaa/ed9ff32fa8ee62acbbf12da9505287bd to your computer and use it in GitHub Desktop.
Python Script to Fetch Books from Libgen
import requests
from bs4 import BeautifulSoup
# Prefix joined onto the image and link paths scraped from search results.
base_url = "http://gen.lib.rus.ec"

# Headers sent with every request; the values imitate a desktop Firefox
# browser instead of the default python-requests client.
headers = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;"
        "q=0.9,image/webp,*/*;q=0.8"
    ),
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "en-US,en;q=0.8",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) "
        "Gecko/20100101 Firefox/41.0"
    ),
}
def get_download_url(u, timeout=30):
    """Return the href of the second link on the libgen.io ads page for *u*.

    Args:
        u: Value sent as the ``md5`` query parameter of ``ads.php``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``href`` attribute of the second ``<a>`` element on the page.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or an HTTP error status.
        IndexError: If the page contains fewer than two links.
        KeyError: If the second link has no ``href`` attribute.
    """
    # Passing the value through ``params`` lets requests URL-encode it rather
    # than splicing raw text into the query string.
    response = requests.get(
        "http://libgen.io/ads.php",
        params={"md5": u},
        headers=headers,
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of scraping a link out of an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    anchors = soup.find_all("a")
    if len(anchors) < 2:
        raise IndexError(
            "expected at least two links on the ads page, found "
            f"{len(anchors)}"
        )
    # NOTE(review): the second anchor is presumably the download link; this
    # depends on the page layout and cannot be confirmed from this file.
    return anchors[1].attrs["href"]
class Booky:
    """Search Libgen for a term and collect metadata for the books found.

    Instance state:
        name: The search term, sent as the ``req`` query parameter.
        data: ``{"books": [...]}``; ``crawl`` appends one dict per parsed
            result table.
    """

    # NOTE(review): the search host differs from ``base_url``, which is used
    # to prefix scraped paths; confirm both still point at the same site.
    _SEARCH_URL = "http://libgen.io/search.php"

    # A result table is only parsed when it has exactly this many rows.
    _ROWS_PER_BOOK = 14

    def __init__(self, name):
        """Store the search term and start with an empty result list."""
        self.name = name
        self.data = {"books": []}

    def crawl(self, timeout=30):
        """Fetch the detailed search page and append each book to ``data``.

        Args:
            timeout: Seconds to wait for the server before giving up.

        Raises:
            requests.exceptions.RequestException: On connection failure,
                timeout, or an HTTP error status.
            IndexError, KeyError: If a 14-row table lacks the expected cells,
                image, link, or attributes.
        """
        # Building the query through ``params`` URL-encodes the search term,
        # so names containing spaces, ``&`` or ``#`` no longer break the query.
        query = {
            "req": self.name,
            "open": 0,
            "res": 25,
            "view": "detailed",
            "phrase": 0,
            "column": "def",
        }
        response = requests.get(
            self._SEARCH_URL,
            params=query,
            headers=headers,
            timeout=timeout,
        )
        # Do not try to parse an error page as search results.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")
        # Only the tables at positions 3..16 of the page are considered.
        for table in soup.find_all("table")[3:17]:
            rows = table.find_all("tr")
            if len(rows) != self._ROWS_PER_BOOK:
                continue
            self.data["books"].append(self._parse_book(rows))

    @staticmethod
    def _parse_book(rows):
        """Build the metadata dict for one book from its table rows."""
        cover_row = rows[1]
        file_cells = rows[9].find_all("td")
        return {
            "image": base_url + cover_row.find_all("img")[0].attrs["src"],
            "title": cover_row.find_all("td")[2].text,
            # Keep only the text before the first comma of the author cell.
            "author": rows[2].find_all("td")[1].text.split(",")[0],
            "file": base_url + cover_row.find_all("a")[0].attrs["href"],
            "ext": file_cells[3].text,
            "size": file_cells[1].text,
        }
if __name__ == "__main__":
    # Demo run: search for "Python" and print everything that was collected.
    booky = Booky("Python")
    booky.crawl()
    print(booky.data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment