Created
March 27, 2018 22:28
-
-
Save zarzen/3cd42175d2647adbd3d6b9d17a6ca9ac to your computer and use it in GitHub Desktop.
apkpure crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sqlite3 | |
from sqlite3 import Error | |
from tqdm import tqdm | |
import requests | |
import urllib.request | |
from pyquery import PyQuery as pq | |
from os.path import join | |
from requests import HTTPError | |
import time | |
def create_conn(db_file):
    """Open a SQLite connection to ``db_file``.

    :param db_file: path of the database file handed to ``sqlite3.connect``
    :return: the open connection, or ``None`` when sqlite3 raised an
        ``Error`` (the error is printed, not re-raised)
    """
    connection = None
    try:
        connection = sqlite3.connect(db_file)
    except Error as err:
        # Best effort: report the failure and let the caller see None.
        print(err)
    return connection
def get_byid(conn, table, entry_id):
    """Fetch the single row of ``table`` whose ``id`` column equals ``entry_id``.

    :param conn: sqlite database connection
    :type conn: sqlite3.Connection
    :param table: name of the table to query; it is interpolated into the
        SQL text (identifiers cannot be bound as parameters), so it must
        come from trusted code, never from user input
    :param entry_id: value compared against the ``id`` column; passed as a
        bound parameter so it can never alter the query
    :return: the matching row as a tuple
    :raises IndexError: when no row has the requested id
    """
    query_str = "select * from {} where id=?".format(table)
    cur = conn.cursor()
    try:
        cur.execute(query_str, (entry_id,))
        # Only one row is returned to the caller, so do not load the rest.
        row = cur.fetchone()
    finally:
        cur.close()
    if row is None:
        # Same exception type the previous ``rows[0]`` lookup produced.
        raise IndexError(
            "no row with id={!r} in table {!r}".format(entry_id, table))
    return row
def download_apk(url, app_id, save_to, timeout=None):
    """Stream the file at ``url`` into ``<save_to>/<app_id>.apk``.

    :param url: direct download URL
    :param app_id: package name; used as the file name stem
    :param save_to: existing directory that receives the file
    :param timeout: optional timeout forwarded to ``requests.get``;
        ``None`` (the default) waits indefinitely, as before
    :return: ``None``; a non-success HTTP status is printed and nothing is
        written for that app
    """
    app_apk = app_id + '.apk'
    # The context manager releases the streamed connection even when the
    # write loop raises part-way through.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if not response.ok:
            # Do not save an HTTP error page under an .apk name.
            print(app_id, 'download failed: HTTP', response.status_code)
            return
        with open(join(save_to, app_apk), 'wb') as out_file:
            for data in response.iter_content(chunk_size=5*1024*1024):
                out_file.write(data)
def get_link(app_id):
    """Look up the direct download link for ``app_id`` on apkpure.

    The download page is fetched, the advertised file size is read from it,
    and the ``href`` of the ``#download_link`` element is returned unless
    the file is larger than 100 MB.

    :param app_id: package name substituted into the download page URL
    :return: the link, or ``None`` when the file is too large, the link is
        absent, or anything fails while fetching/parsing (the error is
        printed together with ``app_id``)
    """
    download_link_template = "https://apkpure.com/apkpure/{}/download?from=details"
    size_selector = 'body > div.main.page-q > div.left > div:nth-child(2) > div.fast-download-box > h1 > span.file > span'
    # Multipliers that convert a size in the given unit to megabytes.
    unit_to_mb = {
        'B': 1.0 / (1024 * 1024),
        'KB': 1.0 / 1024,
        'MB': 1.0,
        'GB': 1024.0,
    }
    link = None
    try:
        download_page_url = download_link_template.format(app_id)
        download_page = pq(url=download_page_url)
        # The parsing assumes text shaped like "(12.3 MB)": strip the
        # parentheses, then split into a number and an optional unit.
        size_tokens = download_page(size_selector).text().strip('(').strip(')').split()
        size_mb = float(size_tokens[0])
        if len(size_tokens) > 1:
            # A missing or unrecognized unit is treated as MB, which is how
            # the bare number was interpreted previously.
            size_mb *= unit_to_mb.get(size_tokens[1].upper(), 1.0)
        # ignore files greater than 100 MB
        if size_mb <= 100.0:
            link = download_page("#download_link").attr('href')
    except Exception as e:
        # Best effort: any fetch or parse failure means "no link" for this app.
        print(app_id, e)
    return link
def main():
    """Download the APK of every app whose id lies in a fixed range.

    For each id the package name is read from the ``metainfo`` table, its
    apkpure download link is resolved with ``get_link`` and, when a link is
    available, the file is stored in ``./apks`` via ``download_apk``.
    """
    import os  # local import: only needed to create the output directory

    meta_info_db = "/home/zarzen/Dev/autocog/googleplayapps20170310metainfo.db"
    apks_dir = './apks'
    db_conn = create_conn(meta_info_db)
    if db_conn is None:
        # create_conn has already printed the sqlite error.
        return
    # download_apk opens files inside this directory, so it must exist.
    os.makedirs(apks_dir, exist_ok=True)
    id_start = 5
    id_end = 1000
    print('*'*20 + 'id range {} - {}'.format(id_start, id_end))
    try:
        for i in tqdm(range(id_start, id_end + 1)):
            try:
                entry = get_byid(db_conn, "metainfo", i)
            except IndexError:
                # A gap in the id sequence should not abort the whole crawl.
                print(i, 'not found')
                continue
            # Index 1 of the row holds the package name (per the original
            # author's note).
            app_id = entry[1]
            apklink = get_link(app_id)
            if apklink is not None:
                print('downloading', app_id)
                download_apk(apklink, app_id, apks_dir)
            else:
                print(app_id, 'not found')
            # NOTE(review): the pause is assumed to run once per app that
            # was looked up; confirm against the original layout.
            time.sleep(3)
    finally:
        db_conn.close()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment