Gist by @nickstenning, created January 26, 2016 15:58.
Scripts for cleaning up via.hypothes.is-prefixed URLs stored in an Elasticsearch index, plus helpers for fetching documents by ID or at random.

#!/usr/bin/env python2
import argparse
import json

import elasticsearch


def get_from_ids(host, port, index, type, ids):
    es = elasticsearch.Elasticsearch([{'host': host, 'port': port}])
    for id in ids:
        # IDs arrive one per line from the ID file, so strip the trailing
        # newline before handing them to Elasticsearch.
        doc = es.get(index=index, doc_type=type, id=id.rstrip('\r\n'))
        print("{} {}".format(doc['_id'], json.dumps(doc['_source'], sort_keys=True)))


def main():
    parser = argparse.ArgumentParser(
        description="Get a bunch of documents whose IDs are stored in a file")
    parser.add_argument("--host")
    parser.add_argument("--port", type=int)
    parser.add_argument("--index")
    parser.add_argument("--type")
    parser.add_argument("--id_file")
    args = parser.parse_args()
    id_file = open(args.id_file)
    get_from_ids(args.host, args.port, args.index, args.type, id_file)


if __name__ == '__main__':
    main()
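
Note that get_from_ids accepts any iterable of ID strings, not just an open file, so it can also be called directly. A minimal sketch (the module name, host, index, type, and IDs here are all hypothetical, since the gist doesn't show its file names):

# Hypothetical usage; "get_from_ids" as a module name is an assumption.
from get_from_ids import get_from_ids

get_from_ids(host='localhost', port=9200,           # assumed local dev node
             index='annotator', type='annotation',  # hypothetical names
             ids=['doc-1', 'doc-2'])
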
#!/usr/bin/env python2
import argparse
import json

import elasticsearch


def get_random_docs(host, port, index, type, numdocs, only_id):
    es = elasticsearch.Elasticsearch([{'host': host, 'port': port}])
    # Score every document with a seeded random_score so the sample is
    # pseudo-random but reproducible across runs.
    query = {
        "size": numdocs,
        "query": {
            "function_score": {
                "functions": [
                    {
                        "random_score": {
                            "seed": 11
                        }
                    }
                ],
                "score_mode": "sum",
            }
        }
    }
    results = es.search(index=index, doc_type=type, body=query)
    for doc in results['hits']['hits']:
        if only_id:
            print(doc['_id'])
        else:
            print("{} {}".format(doc['_id'], json.dumps(doc['_source'], sort_keys=True)))


def main():
    parser = argparse.ArgumentParser(
        description="Get a random sample of documents from an index")
    parser.add_argument("--host")
    parser.add_argument("--port", type=int)
    parser.add_argument("--index")
    parser.add_argument("--type")
    parser.add_argument("--numdocs", type=int)
    parser.add_argument("--only_id", action='store_true')
    args = parser.parse_args()
    get_random_docs(args.host, args.port, args.index, args.type, args.numdocs, args.only_id)


if __name__ == '__main__':
    main()
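
The sampling leans on Elasticsearch's function_score query: random_score assigns each document a pseudo-random score, and because the seed is fixed at 11, the same "random" sample comes back on every run. A standalone sketch of the same query, assuming a local node and hypothetical index/type names:

import elasticsearch

es = elasticsearch.Elasticsearch([{'host': 'localhost', 'port': 9200}])  # assumed local node
body = {
    "size": 5,
    "query": {
        "function_score": {
            "functions": [{"random_score": {"seed": 11}}],  # fixed seed -> reproducible sample
            "score_mode": "sum",
        }
    }
}
results = es.search(index='annotator', doc_type='annotation', body=body)  # hypothetical names
for hit in results['hits']['hits']:
    print(hit['_id'])
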
#!/usr/bin/env python2
from __future__ import print_function

import argparse

import elasticsearch
from elasticsearch import helpers

from via import fix_url


def fetch_all(client, index, doc_type):
    query = {'query': {'match_all': {}}}
    return helpers.scan(client=client,
                        index=index,
                        doc_type=doc_type,
                        query=query)


def get_document_actions(index, documents):
    for doc in documents:
        print('document {}'.format(doc['_id']))
        updates = {}
        if 'favicon' in doc['_source']:
            fixed_favicon = fix_url(doc['_source']['favicon'])
            if fixed_favicon is not None:
                updates['favicon'] = fixed_favicon
        if 'link' in doc['_source']:
            links = doc['_source']['link']
            fixed_links, modified = _process_links(links)
            if modified:
                updates['link'] = fixed_links
        if not updates:
            continue
        yield {'_op_type': 'update',
               '_index': index,
               '_type': 'document',
               '_id': doc['_id'],
               'doc': updates}


def get_annotation_actions(index, annotations):
    for ann in annotations:
        print('annotation {}'.format(ann['_id']))
        updates = {}
        if 'document' in ann['_source']:
            modified = False
            document = ann['_source']['document']
            if 'favicon' in document:
                fixed_favicon = fix_url(document['favicon'])
                if fixed_favicon is not None:
                    document['favicon'] = fixed_favicon
                    modified = True
            if 'link' in document:
                # Store the fixed links back (they may be a new list if the
                # original value was a bare string), and combine the flags
                # rather than clobbering the favicon result above.
                fixed_links, links_modified = _process_links(document['link'])
                if links_modified:
                    document['link'] = fixed_links
                    modified = True
            if modified:
                updates['document'] = document
        if 'target' in ann['_source']:
            modified = False
            target = ann['_source']['target']
            if not isinstance(target, list):
                raise RuntimeError("target is not a list: "
                                   "{!r}".format(target))
            for item in target:
                if 'source' not in item:
                    continue
                fixed_source = fix_url(item['source'])
                if fixed_source is not None:
                    item['source'] = fixed_source
                    modified = True
            if modified:
                updates['target'] = target
        if 'uri' in ann['_source'] and ann['_source']['uri'] is not None:
            uri = fix_url(ann['_source']['uri'])
            if uri is not None:
                updates['uri'] = uri
        if not updates:
            continue
        yield {'_op_type': 'update',
               '_index': index,
               '_type': 'annotation',
               '_id': ann['_id'],
               'doc': updates}


def _process_links(links):
    modified = False
    # Deal with situations such as: {..., 'link': 'http://...', ...}
    if isinstance(links, basestring):
        links = [{'href': links}]
    if not isinstance(links, list):
        raise RuntimeError("link prop wasn't string or list: "
                           "{!r}".format(links))
    for link in links:
        if not isinstance(link, dict):
            raise RuntimeError("link item wasn't a dict: "
                               "{!r}".format(link))
        if 'href' not in link:
            continue
        href = link['href']
        if not isinstance(href, basestring):
            raise RuntimeError("link['href'] wasn't a string: "
                               "{!r}".format(href))
        fixed_href = fix_url(href)
        if fixed_href is not None:
            link['href'] = fixed_href
            modified = True
    return links, modified


def main():
    parser = argparse.ArgumentParser(
        description="Update documents containing via.hypothes.is URLs")
    parser.add_argument("host")
    parser.add_argument("port")
    parser.add_argument("index")
    parser.add_argument("--skip_documents", action='store_true')
    parser.add_argument("--skip_annotations", action='store_true')
    args = parser.parse_args()
    es = elasticsearch.Elasticsearch([{'host': args.host, 'port': args.port}])
    if not args.skip_documents:
        all_documents = fetch_all(es, args.index, 'document')
        actions = get_document_actions(args.index, all_documents)
        helpers.bulk(es, actions)
    if not args.skip_annotations:
        all_annotations = fetch_all(es, args.index, 'annotation')
        actions = get_annotation_actions(args.index, all_annotations)
        helpers.bulk(es, actions)


if __name__ == '__main__':
    main()
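
Both generators above yield partial-document update actions, which elasticsearch.helpers.bulk translates into Bulk API requests; only the changed fields are sent, not the whole document. For illustration, a single action looks like this (hypothetical index, ID, and field values):

action = {
    '_op_type': 'update',     # partial update, not a full reindex
    '_index': 'annotator',    # hypothetical index name
    '_type': 'document',
    '_id': 'abc123',          # hypothetical document ID
    'doc': {'favicon': 'http://example.com/favicon.ico'},  # only the fixed fields
}
# helpers.bulk(es, [action]) would apply this in one Bulk API call.
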
import re

# Possible via prefixes to be stripped off. These should be given in
# decreasing order of specificity, to ensure that the longest possible via
# prefix is removed.
PREFIX_RE = '|'.join([
    r'static/__shared/viewer/web/viewer\.html\?file=/h/id_/',
    r'static/__shared/viewer/web/viewer\.html\?file=/id_/',
    r'static/__shared/viewer/web/viewer\.html\?file=',
    r'h/\d{14}oe_/',
    r'h/\d{14}/',
    r'h/oe_/',
    r'oe_/',
    r'h/',
    r'\d{14}oe_/',
    r'\d{14}/',
])

VIA_URL_RE = re.compile(r'https?://via\.hypothes\.is/(?:' + PREFIX_RE + ')?(.+)',
                        re.IGNORECASE)


def fix_url(url):
    match = re.match(VIA_URL_RE, url)
    # If this URL isn't prefixed by via at all, then we should return None,
    # signifying no change:
    if match is None:
        return None
    url = match.group(1)
    # Strip off any duplicate via prefixes
    while True:
        match = re.match(VIA_URL_RE, url)
        if match is None:
            break
        url = match.group(1)
    # '//example.com' -> 'http://example.com'
    if url.startswith('//'):
        return 'http:' + url
    # 'https:/example.com' -> 'https://example.com'
    match = re.match(r'(https?):/([^/].*)', url, re.IGNORECASE)
    if match is not None:
        return match.group(1) + '://' + match.group(2)
    # Anything without a recognisable scheme gets plain HTTP prepended.
    match = re.match(r'[a-z-]+://(.*)', url, re.IGNORECASE)
    if match is None:
        return 'http://' + url
    return url
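
fix_url's contract: return None when the URL isn't via-prefixed at all (meaning "leave it alone"), otherwise return the unwrapped, cleaned-up URL. A quick illustration drawn from the fixture cases below (imported as via here, matching the migration script; the test module imports it as fixurls.via instead):

from via import fix_url

assert fix_url('https://example.com/foo/bar') is None  # not via-prefixed: no change
assert fix_url('https://via.hypothes.is/https://example.com') == 'https://example.com'
assert fix_url('https://via.hypothes.is/www.artbusiness.com') == 'http://www.artbusiness.com'  # scheme added
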
import pytest

from fixurls.via import fix_url

FIXTURES = [
    # Don't touch links to the via homepage
    ('https://via.hypothes.is', None),
    ('https://via.hypothes.is/', None),
    # Don't touch all kinds of other normal links
    ('https://example.com/foo/bar', None),
    ('android-app://com.google.android.youtube/...', None),
    ('http://example.com/foo.pdf', None),
    ('https://w3c-social.github.io/activitypump/', None),
    ('https://example.com/https://via.hypothes.is', None),
    # Normal via links
    ('https://via.hypothes.is/https://example.com', 'https://example.com'),
    ('https://via.hypothes.is/https://example.com/foo/bar',
     'https://example.com/foo/bar'),
    ('https://via.hypothes.is/http://example.com/foo/bar',
     'http://example.com/foo/bar'),
    # Insecure via links
    ('http://via.hypothes.is/https://example.com', 'https://example.com'),
    ('http://via.hypothes.is/https://example.com/foo/bar',
     'https://example.com/foo/bar'),
    ('http://via.hypothes.is/http://example.com/foo/bar',
     'http://example.com/foo/bar'),
    # Incorrect case via links
    ('http://VIA.Hypothes.IS/http://example.com/foo/bar',
     'http://example.com/foo/bar'),
    # Phone app links (yes, really)
    ('https://via.hypothes.is/android-app://com.google.android.youtube/...',
     'android-app://com.google.android.youtube/...'),
    # Links missing scheme
    ('https://via.hypothes.is/www.artbusiness.com',
     'http://www.artbusiness.com'),
    ('https://via.hypothes.is/devblog.avdi.org/2015/',
     'http://devblog.avdi.org/2015/'),
    ('https://via.hypothes.is/tapcore.com/...',
     'http://tapcore.com/...'),
    ('https://via.hypothes.is///codegeekz.com/wp-content/uploads/codegeekz-favicon.png',
     'http://codegeekz.com/wp-content/uploads/codegeekz-favicon.png'),
    # Strange prefixes
    ('https://via.hypothes.is/h/http://example.com', 'http://example.com'),
    ('https://via.hypothes.is/oe_/http://example.com', 'http://example.com'),
    ('https://via.hypothes.is/h/oe_/http://example.com', 'http://example.com'),
    ('https://via.hypothes.is/20150520202836/http://example.com',
     'http://example.com'),
    ('https://via.hypothes.is/20150520202836oe_/http://example.com',
     'http://example.com'),
    ('https://via.hypothes.is/h/20150520202836/http://example.com',
     'http://example.com'),
    ('https://via.hypothes.is/h/20150520202836oe_/http://example.com',
     'http://example.com'),
    ('https://via.hypothes.is/static/__shared/viewer/web/viewer.html?file=http://example.com/foo.pdf',
     'http://example.com/foo.pdf'),
    ('https://via.hypothes.is/static/__shared/viewer/web/viewer.html?file=/id_/http://example.com/foo.pdf',
     'http://example.com/foo.pdf'),
    ('https://via.hypothes.is/static/__shared/viewer/web/viewer.html?file=/h/id_/http://example.com/foo.pdf',
     'http://example.com/foo.pdf'),
    # Strange prefixes with missing scheme
    ('https://via.hypothes.is/h/www.example.com', 'http://www.example.com'),
    ('https://via.hypothes.is/20150520202836/foo.com/bar/',
     'http://foo.com/bar/'),
    ('https://via.hypothes.is/oe_///www.example.com',
     'http://www.example.com'),
    # URLs with broken prefixes
    ('https://via.hypothes.is/https:/w3c-social.github.io/activitypump/',
     'https://w3c-social.github.io/activitypump/'),
    # Double prefixed
    ('https://via.hypothes.is/https://via.hypothes.is/http://www.nytimes.com/roomfordebate/',
     'http://www.nytimes.com/roomfordebate/'),
]


@pytest.mark.parametrize('url_in,url_out', FIXTURES)
def test_fix_url(url_in, url_out):
    assert fix_url(url_in) == url_out