Skip to content

Instantly share code, notes, and snippets.

@zarzen
Created March 27, 2018 22:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zarzen/3cd42175d2647adbd3d6b9d17a6ca9ac to your computer and use it in GitHub Desktop.
Save zarzen/3cd42175d2647adbd3d6b9d17a6ca9ac to your computer and use it in GitHub Desktop.
apkpure crawler
import sqlite3
from sqlite3 import Error
from tqdm import tqdm
import requests
import urllib.request
from pyquery import PyQuery as pq
from os.path import join
from requests import HTTPError
import time
def create_conn(db_file):
    """Open a SQLite connection to ``db_file``.

    :param db_file: path handed straight to ``sqlite3.connect``
    :return: the open connection, or ``None`` when ``sqlite3`` raised an
        ``Error`` (the error is printed before returning)
    """
    try:
        return sqlite3.connect(db_file)
    except Error as err:
        # Best effort: report the problem and let the caller see None.
        print(err)
    return None
def get_byid(conn, table, entry_id):
    """
    Fetch the row of ``table`` whose ``id`` column equals ``entry_id``.

    :param conn: sqlite database connection
    :type conn: sqlite3.Connection
    :param table: table name; it is formatted into the SQL text because
        SQLite cannot bind identifiers, so it must come from trusted code
    :param entry_id: value compared with the ``id`` column; passed as a
        bound parameter, never spliced into the statement
    :return: the first matching row as produced by the cursor
    :raises IndexError: when no row has that id
    """
    cur = conn.cursor()
    # Binding the id keeps a value such as "5 or 1=1" from rewriting the query.
    query_str = "select * from {} where id=?".format(table)
    cur.execute(query_str, (entry_id,))
    # Only one row is needed, so avoid materialising the whole result set.
    row = cur.fetchone()
    if row is None:
        # Same exception type the former ``rows[0]`` lookup produced.
        raise IndexError(
            "no row with id={!r} in table {!r}".format(entry_id, table))
    return row
def download_apk(url, app_id, save_to, chunk_size=5 * 1024 * 1024, timeout=60):
    """Stream the file at ``url`` into ``<save_to>/<app_id>.apk``.

    :param url: direct download URL
    :param app_id: used as the file name stem (``app_id + '.apk'``)
    :param save_to: target directory; created if it does not exist
    :param chunk_size: number of bytes requested per streamed chunk
    :param timeout: seconds passed to ``requests.get`` so a stalled server
        cannot block the caller forever
    :return: ``True`` when the file was written, ``False`` when the server
        answered with an error status (nothing is written in that case)
    :raises requests.RequestException: on connection problems or timeouts
    """
    import os  # local import: the file header only imports os.path.join

    final_path = join(save_to, app_id + '.apk')
    # Download under a temporary name so an interrupted transfer never
    # leaves a truncated file that looks like a valid .apk.
    partial_path = final_path + '.part'
    if save_to:
        os.makedirs(save_to, exist_ok=True)

    completed = False
    try:
        # The context manager releases the streamed connection on exit.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if not response.ok:
                # Do not store an HTML error page under an .apk name.
                print(app_id, 'download failed with HTTP status',
                      response.status_code)
                return False
            with open(partial_path, 'wb') as out_file:
                for data in response.iter_content(chunk_size=chunk_size):
                    out_file.write(data)
        os.replace(partial_path, final_path)
        completed = True
    finally:
        if not completed and os.path.exists(partial_path):
            os.remove(partial_path)
    return True
def _size_in_mb(size_text):
    """Convert size text such as ``"(12.3 MB)"`` into megabytes as a float.

    A missing or unrecognised unit is treated as MB, which is what the
    previous number-only parsing assumed for every value.

    :raises ValueError: when the text holds no parsable number
    """
    unit_to_mb = {
        'B': 1.0 / (1024 * 1024),
        'KB': 1.0 / 1024,
        'MB': 1.0,
        'GB': 1024.0,
    }
    parts = size_text.strip().strip('()').split()
    if not parts:
        raise ValueError('file size not found on download page')
    unit = parts[1].upper() if len(parts) > 1 else 'MB'
    return float(parts[0]) * unit_to_mb.get(unit, 1.0)


def get_link(app_id, max_size_mb=100.0):
    """Look up the direct download link for ``app_id`` on apkpure.

    Loads the app's download page, reads the file size shown there and
    returns the ``href`` of the ``#download_link`` element.

    :param app_id: package identifier inserted into the download-page URL
    :param max_size_mb: files reported as larger than this many megabytes
        are skipped
    :return: the link, or ``None`` when the file is too large, the link
        element has no ``href``, or loading/parsing the page failed (the
        error is printed together with ``app_id``)
    """
    download_link_template = "https://apkpure.com/apkpure/{}/download?from=details"
    try:
        download_page = pq(url=download_link_template.format(app_id))
        # NOTE(review): this selector mirrors the page layout at the time the
        # script was written; confirm it still matches the live site.
        fsize = download_page('body > div.main.page-q > div.left > div:nth-child(2) > div.fast-download-box > h1 > span.file > span')
        # Honour the unit so that e.g. "1.2 GB" is not read as 1.2 MB.
        if _size_in_mb(fsize.text()) > max_size_mb:
            return None
        return download_page("#download_link").attr('href')
    except Exception as e:
        # Per-app best effort: one handler covers HTTP errors as well, since
        # the former separate HTTPError clause did exactly the same thing.
        print(app_id, e)
    return None
def main(meta_info_db="/home/zarzen/Dev/autocog/googleplayapps20170310metainfo.db",
         apks_dir='./apks', id_start=5, id_end=1000, delay=3):
    """Crawl apkpure for the apps listed in the ``metainfo`` table.

    For every id in ``id_start``..``id_end`` (inclusive) the row is read
    from the metadata database, its download link is looked up and, if one
    is found, the APK is saved into ``apks_dir``.

    :param meta_info_db: path of the SQLite metadata database
    :param apks_dir: directory that receives the downloaded ``.apk`` files
    :param id_start: first row id to process
    :param id_end: last row id to process (inclusive)
    :param delay: seconds to sleep after each id, to throttle requests
    """
    db_conn = create_conn(meta_info_db)
    if db_conn is None:
        # create_conn has already printed the sqlite error.
        print('cannot open metadata database', meta_info_db)
        return
    print('*'*20 + 'id range {} - {}'.format(id_start, id_end))
    try:
        for i in tqdm(range(id_start, id_end + 1)):
            try:
                entry = get_byid(db_conn, "metainfo", i)
            except IndexError:
                # A gap in the id sequence must not abort the whole crawl.
                print('no metainfo row with id', i)
                continue
            # Column index 1 is used as the package name (original author's
            # note) -- TODO confirm against the table schema.
            app_id = entry[1]
            apklink = get_link(app_id)
            if apklink is not None:
                print('downloading', app_id)
                try:
                    download_apk(apklink, app_id, apks_dir)
                except requests.RequestException as e:
                    # One failed transfer should not stop the remaining ids.
                    print(app_id, e)
            else:
                print(app_id, 'not found')
            time.sleep(delay)
    finally:
        db_conn.close()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment