Skip to content

Instantly share code, notes, and snippets.

@iamaziz
Created July 17, 2016 03:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save iamaziz/406840cef049cfeef053fe3d71119337 to your computer and use it in GitHub Desktop.
Save iamaziz/406840cef049cfeef053fe3d71119337 to your computer and use it in GitHub Desktop.
generate gensim docset
# generate gensim docset
# http://radimrehurek.com/gensim/
#----------------------------------
# built-in packages
import sqlite3
import os
import urllib
import plistlib
#----------------------------------
# third party packages + httrack
import requests
from bs4 import BeautifulSoup as bs
# prepare docset folder and html
def setup_docset(doc_name, url, download_html=False):
    """Create the docset folder skeleton and optionally mirror ``url`` into it.

    Args:
        doc_name: Base name of the docset; ``.docset`` is appended to it.
        url: Root URL of the documentation site to mirror.
        download_html: When true, run ``httrack`` inside the docset folder and
            move what it downloads into ``Contents/Resources/Documents``.

    Returns:
        The docset folder name, i.e. ``'<doc_name>.docset'``.
    """
    # shlex.quote on Python 3, pipes.quote on Python 2.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    # docset settings
    docset_name = '{}.docset'.format(doc_name)
    output = docset_name + '/Contents/Resources/Documents/'

    # docset directory
    if not os.path.exists(output):
        os.makedirs(output)

    if download_html:
        # The caller-supplied values are shell-quoted so that spaces or shell
        # metacharacters in them cannot alter the command. The httrack filter
        # patterns (-*, +*.css, ...) are intentionally left as written.
        steps = [
            'cd {docset}',
            "httrack -%v2 -T60 -R99 --sockets=7 -%c1000 -c10 -A999999999 -%N0 --disable-security-limits -F 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.19 (KHTML, like Gecko) Ubuntu/11.10 Chromium/18.0.1025.168' --mirror --keep-alive --robots=0 {url} -n -* +*.css +*css.php +*.ico +*/fonts/* +*.svg +*.ttf +fonts.googleapis.com* +*.woff +*.eot +*.png +*.jpg +*.gif +*.jpeg +*.js +{url}* -github.com* +raw.github.com*",
            'rm -rf hts-*',
            'mkdir -p Contents/Resources/Documents',
            'mv -f *.* Contents/Resources/Documents/',
        ]
        cmd_command = ' && '.join(steps).format(
            docset=quote(docset_name), url=quote(url))

        # Report a failed mirror instead of silently continuing; the folder
        # name is still returned so the caller's flow is unchanged.
        status = os.system(cmd_command)
        if status != 0:
            print('httrack mirror command failed (exit status {})'.format(status))

    return docset_name
def update_db(name, typ, path, cursor=None):
    """Insert one search-index entry unless its name or path is already indexed.

    Args:
        name: Entry title stored in the ``name`` column.
        typ: Entry type stored in the ``type`` column (e.g. ``'Guide'``).
        path: Page path stored in the ``path`` column.
        cursor: SQLite cursor to use. When omitted, the module-level ``cur``
            global is used.

    Database errors are reported on stdout and the entry is skipped; other
    exceptions are no longer swallowed.
    """
    if cursor is None:
        cursor = cur  # module-level cursor shared through ``global cur``

    try:
        # An entry is a duplicate when either its path or its name is present.
        cursor.execute(
            "SELECT rowid FROM searchIndex WHERE path = ? OR name = ? LIMIT 1",
            (path, name))
        if cursor.fetchone() is not None:
            print("record exists")
            return
        cursor.execute(
            'INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES (?,?,?)',
            (name, typ, path))
        print('DB add >> name: {0} | type: {1} | path: {2}'.format(name, typ, path))
    except sqlite3.Error as err:
        print('DB error >> name: {0} | {1}'.format(name, err))
def add_infoplist(base_page, docset_name):
    """Write ``Contents/Info.plist`` for the docset.

    Args:
        base_page: URL of the documentation start page; the part after the
            scheme separator (``//``) becomes ``dashIndexFilePath``.
        docset_name: Path of the ``.docset`` folder. Its ``Contents``
            directory must already exist.
    """
    # NOTE(review): for a URL ending in "/" this is a folder path rather than
    # a file path -- confirm that is acceptable for dashIndexFilePath.
    index_file = base_page.split("//", 1)[-1]

    # Take the bundle name from the folder's own name so that a docset path
    # with a leading directory or extra dots still yields the right name.
    name = os.path.splitext(os.path.basename(docset_name.rstrip('/')))[0]

    plist_path = os.path.join(docset_name, "Contents", "Info.plist")
    plist_cfg = {
        'CFBundleIdentifier': name,
        'CFBundleName': name,
        'DocSetPlatformFamily': name.lower(),
        'DashDocSetFamily': 'python',
        'isDashDocset': True,
        'dashIndexFilePath': index_file,
    }

    # plistlib.writePlist was removed in Python 3.9; prefer dump() when the
    # running interpreter provides it and fall back for Python 2.
    if hasattr(plistlib, 'dump'):
        with open(plist_path, 'wb') as plist_file:
            plistlib.dump(plist_cfg, plist_file)
    else:
        plistlib.writePlist(plist_cfg, plist_path)
def add_urls(pages):
    """Index the internal reference links found on each index page.

    Args:
        pages: Mapping of entry type (e.g. ``'Section'``, ``'Guide'``) to the
            URL of the page whose ``<a class="reference internal">`` links
            are added to the search index under that type.
    """
    # For these entry types the page's file name is stripped from the base
    # path before the link's href is appended.
    strip_page_name = ('Guide', 'Library')

    # loop through index pages:
    for entry_type, page_url in pages.items():
        # setup paths
        page_name = page_url.split('/')[-1]
        base_path = page_url.split("//")[1]
        if entry_type in strip_page_name:
            prefix = base_path.replace(page_name, '')
        else:
            prefix = base_path

        # soup each index page; an explicit parser keeps results independent
        # of which optional parsers happen to be installed.
        html = requests.get(page_url, timeout=60).text
        soup = bs(html, 'html.parser')

        for anchor in soup.find_all('a', class_='reference internal'):
            href = anchor.get('href')
            if href is None:
                # Anchors without an href have no page to point at.
                continue
            # Collapse runs of whitespace in the link text to single spaces.
            name = " ".join(anchor.text.split())
            update_db(name, entry_type, prefix + href)
def main():
    """Build the gensim docset: mirror the site, index it, write metadata."""
    global cur

    # urlretrieve lives in urllib.request on Python 3 and urllib on Python 2.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    # index pages
    name = 'gensim'
    website = 'http://radimrehurek.com/gensim/'
    pages = {
        'Section': website,
        'Guide': 'http://radimrehurek.com/gensim/tutorial.html',
        'Library': 'http://radimrehurek.com/gensim/apiref.html',
    }

    docset_name = setup_docset(name, website, download_html=True)

    # docset icon
    icon = 'https://www.python.org/static/apple-touch-icon-precomposed.png'
    urlretrieve(icon, docset_name + "/icon.png")

    # create and connect to SQLite
    db = sqlite3.connect(docset_name + '/Contents/Resources/docSet.dsidx')
    cur = db.cursor()
    try:
        # Always start from an empty index. Dropping only when the table
        # exists and then recreating it unconditionally makes re-runs work.
        cur.execute('DROP TABLE IF EXISTS searchIndex;')
        cur.execute('CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);')
        cur.execute('CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);')

        # docset entries
        add_urls(pages)
        add_infoplist(website, docset_name)

        # report num of entries (fetchone() returns a 1-tuple)
        cur.execute('Select count(*) from searchIndex;')
        entry = cur.fetchone()
        print("{} entry.".format(entry[0]))

        # commit db
        db.commit()
    finally:
        # close the connection even when indexing fails
        db.close()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment