Skip to content

Instantly share code, notes, and snippets.

@dahnielson
Last active March 14, 2017 19:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dahnielson/097628605744715bc346433bb3ac32b6 to your computer and use it in GitHub Desktop.
Save dahnielson/097628605744715bc346433bb3ac32b6 to your computer and use it in GitHub Desktop.
Sitemap for webscraper.io (scrapes del.icio.us bookmarks) and a Python script that imports the exported CSV into Pinboard
{
"startUrl": "https://del.icio.us/<your username>?&page=[1-100]",
"selectors": [{
"parentSelectors": ["_root"],
"type": "SelectorElement",
"multiple": true,
"id": "bookmark",
"selector": "div.articleThumbBlockOuter",
"delay": ""
}, {
"parentSelectors": ["bookmark"],
"type": "SelectorText",
"multiple": false,
"id": "title",
"selector": "a.title",
"regex": "",
"delay": ""
}, {
"parentSelectors": ["bookmark"],
"type": "SelectorElementAttribute",
"multiple": false,
"id": "link",
"selector": "p:nth-of-type(1) a",
"delay": "",
"extractAttribute": "href"
}, {
"parentSelectors": ["bookmark"],
"type": "SelectorGroup",
"id": "tag",
"selector": "ul.tagName a",
"extractAttribute": "",
"delay": ""
}, {
"parentSelectors": ["bookmark"],
"type": "SelectorElementAttribute",
"multiple": false,
"id": "date",
"selector": "_parent_",
"extractAttribute": "date",
"delay": ""
}, {
"parentSelectors": ["bookmark"],
"type": "SelectorElementAttribute",
"multiple": false,
"id": "private",
"selector": "div.articleThumbBlock",
"extractAttribute": "class",
"delay": ""
}],
"_id": "delicious"
}
import csv
import json
import time
import urllib.parse
import urllib.request
from datetime import datetime, timezone
# Path of the CSV file to import; it is opened as UTF-8 by the import loop below.
bookmark_file = '<exported csv file name>'
# Pinboard API token; sent as the ``auth_token`` parameter of every request.
pinboard_token = '<your pinboard token>'
def process_tags(csv_tags):
    """Extract tag names from the JSON-encoded tag cell of a CSV row.

    Args:
        csv_tags: Text of the tag cell: a JSON array of objects that each
            carry the tag name under the ``"tag"`` key, e.g.
            ``'[{"tag": "python"}, {"tag": "web"}]'``. May be empty or
            whitespace-only when the bookmark has no tags.

    Returns:
        A list with the ``"tag"`` value of every array element, in order;
        an empty list when the cell is blank.

    Raises:
        json.JSONDecodeError: If a non-blank cell is not valid JSON.
        KeyError: If an array element has no ``"tag"`` key.
    """
    # A blank cell is not valid JSON; treat it as "no tags" instead of
    # letting json.loads() raise and abort the whole import.
    if not csv_tags or not csv_tags.strip():
        return []
    return [entry['tag'] for entry in json.loads(csv_tags)]
# Import every bookmark from the exported CSV into Pinboard, issuing one
# posts/add API call per row.
with open(bookmark_file, newline='', encoding="utf8") as csvfile:
    bookmarks = csv.reader(csvfile, delimiter=',', quotechar='"')
    next(bookmarks, None)  # Skip header; the default avoids StopIteration on an empty file
    for bookmark in bookmarks:
        if not bookmark:
            continue  # csv.reader yields an empty list for blank lines

        # Columns as indexed below: 0 = title, 1 = URL, 2 = tags (JSON),
        # 3 = date as integer seconds since the epoch, 4 = CSS class list.
        bookmark_post_data = {
            'url': bookmark[1],
            'description': bookmark[0],
            # Pinboard documents datetimes as UTC in the form
            # 2010-12-11T19:48:02Z, so convert in UTC rather than emitting a
            # naive local-time string.
            'dt': datetime.fromtimestamp(
                int(bookmark[3]), tz=timezone.utc
            ).strftime('%Y-%m-%dT%H:%M:%SZ'),
            # A 'privateArticle' class on the thumbnail block marks the
            # bookmark as private.
            'shared': 'no' if 'privateArticle' in bookmark[4].split() else 'yes',
        }

        # Only parse the tag cell when it has content: an empty cell is not
        # valid JSON and must not abort the import.
        if bookmark[2]:
            tags = process_tags(bookmark[2])
            if tags:
                bookmark_post_data['tags'] = ",".join(tags)

        bookmark_post_data['auth_token'] = pinboard_token

        print("Adding %s" % bookmark[1])
        params = urllib.parse.urlencode(bookmark_post_data)
        # The context manager closes the HTTP response (and its socket)
        # instead of leaking one connection per bookmark.
        with urllib.request.urlopen("https://api.pinboard.in/v1/posts/add?%s" % params):
            pass

        # Pinboard limits API clients to one call every three seconds;
        # pausing keeps a long import from being throttled with HTTP 429.
        time.sleep(3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment