# Gist iamaziz/406840cef049cfeef053fe3d71119337: generate gensim docset
# Created July 17, 2016 03:21
# NOTE: GitHub flags this file as containing bidirectional Unicode text that may
# be interpreted or compiled differently than it appears; review it in an editor
# that reveals hidden Unicode characters.
# generate gensim docset
# http://radimrehurek.com/gensim/
#----------------------------------
# built-in packages
import os
import plistlib
import sqlite3
import urllib
#----------------------------------
# third party packages + httrack
import requests
from bs4 import BeautifulSoup as bs
# prepare docset folder and html
def setup_docset(doc_name, url, download_html=False):
    """Create the docset folder skeleton and optionally mirror the site into it.

    Args:
        doc_name: Base name of the docset; ``.docset`` is appended to it.
        url: Root URL of the documentation site to mirror.
        download_html: When true, run ``httrack`` through the shell to download
            the site and move the result into ``Contents/Resources/Documents``.

    Returns:
        The docset directory name, i.e. ``doc_name + '.docset'``.
    """
    # shlex.quote is the Python 3 name; pipes.quote is the Python 2 equivalent.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    # docset settings
    docset_name = '{}.docset'.format(doc_name)
    output = docset_name + '/Contents/Resources/Documents/'

    # docset directory: create it and tolerate "already exists" instead of
    # checking first, which would race with anything creating it concurrently.
    try:
        os.makedirs(output)
    except OSError:
        if not os.path.isdir(output):
            raise

    if not download_html:
        return docset_name

    # Both interpolated values are shell-quoted so a name or URL containing
    # spaces or shell metacharacters cannot alter the command. Values made only
    # of safe characters (such as the gensim URL) are passed through unchanged.
    cmd_command = """
    cd {0} &&
    httrack -%v2 -T60 -R99 --sockets=7 -%c1000 -c10 -A999999999 -%N0 --disable-security-limits -F 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.19 (KHTML, like Gecko) Ubuntu/11.10 Chromium/18.0.1025.168' --mirror --keep-alive --robots=0 {1} -n -* +*.css +*css.php +*.ico +*/fonts/* +*.svg +*.ttf +fonts.googleapis.com* +*.woff +*.eot +*.png +*.jpg +*.gif +*.jpeg +*.js +{1}* -github.com* +raw.github.com* &&
    rm -rf hts-* &&
    mkdir -p Contents/Resources/Documents &&
    mv -f *.* Contents/Resources/Documents/
    """.format(quote(docset_name), quote(url))

    status = os.system(cmd_command)
    if status != 0:
        # Stay best-effort (the caller still gets the docset name) but make
        # the failure visible instead of discarding the exit status.
        print('mirror command exited with status {0}'.format(status))
    return docset_name
def update_db(name, typ, path):
    """Insert one entry into ``searchIndex`` unless its name or path is present.

    Uses the module-level cursor ``cur`` that ``main`` creates.

    Args:
        name: Entry name stored in the ``name`` column.
        typ: Entry type stored in the ``type`` column.
        path: Document path stored in the ``path`` column.

    Database errors are reported and skipped so one bad entry does not abort
    the whole indexing run; any other exception propagates to the caller.
    """
    try:
        cur.execute("SELECT rowid FROM searchIndex WHERE path = ?", (path,))
        dbpath = cur.fetchone()
        cur.execute("SELECT rowid FROM searchIndex WHERE name = ?", (name,))
        dbname = cur.fetchone()
        # Only add the record when neither its path nor its name is indexed yet.
        if dbpath is None and dbname is None:
            cur.execute('INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES (?,?,?)', (name, typ, path))
            print('DB add >> name: {0} | type: {1} | path: {2}'.format(name, typ, path))
        else:
            print("record exists")
    except (sqlite3.Error, UnicodeError) as err:
        # UnicodeError covers names that cannot be formatted or printed on the
        # current console; like database errors it should not stop the run.
        print('DB error >> {0}: {1}'.format(type(err).__name__, err))
def add_infoplist(base_page, docset_name):
    """Write ``Contents/Info.plist`` for the docset.

    Args:
        base_page: URL of the documentation start page; the part after the
            scheme separator ``//`` is stored as ``dashIndexFilePath``.
        docset_name: Docset directory (e.g. ``gensim.docset``); the text before
            its first ``.`` is used as the bundle identifier and name. The
            ``Contents`` directory inside it must already exist.
    """
    # Split only on the first "//" so a later "//" in the URL is kept, and a
    # URL without a scheme is used as-is instead of raising IndexError.
    index_file = base_page.split("//", 1)[-1]
    name = docset_name.split('.')[0]
    plist_path = os.path.join(docset_name, "Contents", "Info.plist")
    plist_cfg = {
        'CFBundleIdentifier': name,
        'CFBundleName': name,
        'DocSetPlatformFamily': name.lower(),
        'DashDocSetFamily': 'python',
        'isDashDocset': True,
        'dashIndexFilePath': index_file,
    }
    if hasattr(plistlib, 'dump'):
        # plistlib.writePlist was removed in Python 3.9; dump() replaces it.
        with open(plist_path, 'wb') as plist_file:
            plistlib.dump(plist_cfg, plist_file)
    else:
        # Python 2 only provides writePlist.
        plistlib.writePlist(plist_cfg, plist_path)
def add_urls(pages):
    """Scrape each index page and record its internal links via ``update_db``.

    Args:
        pages: Mapping of entry type (e.g. ``'Section'``, ``'Guide'``) to the
            URL of the index page whose ``a.reference.internal`` links should
            be indexed under that type.
    """
    # For these types the links are joined to the page's directory rather than
    # to the page URL itself.
    file_page_types = ('Guide', 'Library')

    # loop through index pages:
    for entry_type, page_url in pages.items():
        # setup paths
        page_name = page_url.split('/')[-1]
        base_path = page_url.split("//")[1]
        if entry_type in file_page_types:
            # Drop the page's file name so hrefs are appended to its directory.
            base_path = base_path.replace(page_name, '')

        # soup each index page
        html = requests.get(page_url).text
        soup = bs(html)
        for a in soup.findAll('a', class_='reference internal'):
            href = a.get('href')
            if href is None:
                # An anchor without href has no target to index; concatenating
                # None to the base path would raise TypeError.
                continue
            # Collapse runs of whitespace in the link text to single spaces.
            name = " ".join(a.text.split())
            update_db(name, entry_type, base_path + href)
def main():
    """Build the gensim docset: mirror the site, index it, write Info.plist.

    Sets the module-level cursor ``cur`` that ``update_db`` relies on.
    """
    global cur

    # urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    # index pages
    name = 'gensim'
    website = 'http://radimrehurek.com/gensim/'
    pages = {
        'Section': website,
        'Guide': 'http://radimrehurek.com/gensim/tutorial.html',
        'Library': 'http://radimrehurek.com/gensim/apiref.html',
    }
    docset_name = setup_docset(name, website, download_html=True)

    # docset icon
    icon = 'https://www.python.org/static/apple-touch-icon-precomposed.png'
    urlretrieve(icon, docset_name + "/icon.png")

    # create and connect to SQLite
    db = sqlite3.connect(docset_name + '/Contents/Resources/docSet.dsidx')
    try:
        cur = db.cursor()
        # Always start from a fresh table. Creating it only when DROP fails
        # would leave a rerun against an existing database with no table.
        cur.execute('DROP TABLE IF EXISTS searchIndex;')
        cur.execute('CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);')
        cur.execute('CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);')

        # docset entries
        add_urls(pages)
        add_infoplist(website, docset_name)

        # report num of entries (fetchone returns a 1-tuple; print the count)
        cur.execute('Select count(*) from searchIndex;')
        entry = cur.fetchone()
        print("{} entry.".format(entry[0]))

        # commit db
        db.commit()
    finally:
        # Close the connection even when scraping or indexing fails.
        db.close()
# Build the docset only when run as a script, not when imported.
if __name__ == '__main__':
    main()
# (GitHub page footer: sign up or sign in on GitHub to comment on this gist.)