Skip to content

Instantly share code, notes, and snippets.

@rachmadaniHaryono
Last active March 7, 2022 21:08
Show Gist options
  • Save rachmadaniHaryono/165cc82862315f4d1dc49abca2d915b0 to your computer and use it in GitHub Desktop.
Save rachmadaniHaryono/165cc82862315f4d1dc49abca2d915b0 to your computer and use it in GitHub Desktop.
hydrus script to replace namespace
"""
This script replaces Hydrus tag namespaces based on the given parameters and the Hydrus tag counts.
The tag file is a list of Hydrus tags, one per line, each followed by its count number in parentheses.
Example:
$ # replace the 'https' namespace with 'url'
$ # "https://example.com" -> 'url:https://example.com'
$ hydrus_script.py replace-namespace --access_key 1234_access_key --tag_file hydrus.txt 'https' 'url:https:'
$ # change the namespace to 'tag' for namespaces that occur only once
$ hydrus_script.py replace-namespace-by-count --access_key 1234_access_key --tag_file hydrus.txt 1 tag
"""
import logging
from hydrus import Client
from tabulate import tabulate
from tqdm import tqdm
import click
import pandas as pd
def get_dataframe(txt_file):
    """Parse a Hydrus tag listing into a DataFrame.

    Each non-blank line is expected to look like ``namespace:value (count)``
    or ``value (count)``; the parentheses may also hold a dash-separated
    pair such as ``(1-5)``.  Dots inside the parentheses are removed before
    the numbers are split, and the numbers are kept as strings.

    Args:
        txt_file: Path of the text file to read (decoded as UTF-8).

    Returns:
        pandas.DataFrame with the columns ``count``, ``full count``,
        ``namespace`` and ``value`` (in that order), one row per non-blank
        line.  ``full count`` is None unless the parentheses hold exactly
        two dash-separated parts, ``namespace`` is None when the line has no
        ``:``, and ``count``/``full count`` are None when the line has no
        ``(``.  ``value`` keeps any whitespace that preceded the ``(``.
    """
    columns = ['count', 'full count', 'namespace', 'value']
    with open(txt_file, encoding='utf-8') as f:
        # Blank lines carry no tag, so skip them instead of failing on the
        # missing "(count)" suffix.
        lines = [line for line in f.read().splitlines() if line.strip()]

    records = [_parse_tag_line(line) for line in lines]
    # Passing the column names explicitly keeps the frame well-formed even
    # when the file contained no tags at all.
    return pd.DataFrame(records, columns=columns)


def _parse_tag_line(line):
    """Split one listing line into a (count, full count, namespace, value) tuple."""
    count = full_count = None
    _, paren, tail = line.rpartition('(')
    if paren:
        # Text between the last "(" and the next ")", without dots.
        count_parts = tail.split(')')[0].replace('.', '').split('-')
        count = count_parts[0]
        if len(count_parts) == 2:
            full_count = count_parts[1]

    # The namespace is everything before the first ":"; the value is what
    # follows it, up to the last "(".
    prefix, colon, rest = line.partition(':')
    if colon:
        namespace = prefix
        value = rest.rsplit('(', 1)[0]
    else:
        namespace = None
        value = line.rsplit('(', 1)[0]
    return count, full_count, namespace, value
def replace_tag(tag_sets, access_key=None, add_tags=None):
    """Swap tags on every file that currently carries them.

    Args:
        tag_sets: Iterable of ``(current_tag, new_tag)`` pairs; they are
            processed in sorted order.
        access_key: Value handed to ``Client`` when it is constructed.
        add_tags: Optional iterable of extra tags sent along with each
            ``new_tag``.

    Pairs whose ``current_tag`` matches no file are skipped silently.  Any
    exception raised while updating a pair is written to the progress output
    and the loop moves on to the next pair.
    """
    extra_tags = list(add_tags) if add_tags else []
    client = Client(access_key)
    for old_tag, replacement in tqdm(sorted(tag_sets)):
        file_ids = client.search_files([old_tag])
        if not file_ids:
            # TODO: report skipped tags at debug level
            continue
        tqdm.write('tag:{}\nnew_tag:{}'.format(old_tag, replacement))
        try:
            metadata = client.file_metadata(file_ids=file_ids, only_identifiers=True)
            hashes = [entry['hash'] for entry in metadata]
            # NOTE(review): '0' / '1' are presumably the Hydrus "add" and
            # "delete" tag actions -- confirm against the client API docs.
            actions = {'0': [replacement] + extra_tags, '1': [old_tag]}
            client.add_tags(hashes, services_to_actions={'local tags': actions})
        except Exception as err:
            tqdm.write('tag:{}\nerror:{}'.format(old_tag, err))
@click.group()
def cli():
    # Root click command group; the commands in this file attach themselves
    # to it through @cli.command().  It performs no work of its own.
    pass
@cli.command()
@click.argument('tag_file')
def print_namespace_ranking(tag_file):
    # Print a table with the number of TAG_FILE entries found for each
    # namespace, as computed by value_counts() on the 'namespace' column.
    tags = get_dataframe(tag_file)
    ranking = tags['namespace'].value_counts()
    table = tabulate(ranking.to_frame())
    print(table)
@cli.command()
@click.argument('tag_file')
@click.option('--namespace')
def print_tag(tag_file, namespace=None):
    """Print the tags parsed from TAG_FILE, optionally for one namespace only."""
    content_df = get_dataframe(tag_file)
    if namespace:
        # Filter on the namespace the caller asked for (the filter used to
        # compare against a hard-coded, misspelled namespace).
        content_df = content_df[content_df['namespace'] == namespace]
    print(tabulate(content_df))
@cli.command()
@click.argument('target_namespace')
@click.argument('prepend_text')
@click.option('--tag_file', required=True)
@click.option('--access_key')
@click.option('--add_tag', multiple=True)
def replace_namespace(target_namespace, prepend_text, tag_file=None, access_key=None, add_tag=None):
    """Replace TARGET_NAMESPACE with PREPEND_TEXT on every matching tag."""
    # --tag_file is required: without it there is nothing to parse.
    content_df = get_dataframe(tag_file)
    # Keep the filtered rows as a DataFrame; both the 'namespace' and the
    # 'value' column are needed to rebuild the current tag below.
    match_df = content_df[content_df['namespace'] == target_namespace]
    # Each pair is (tag as it exists now, tag that should replace it).  The
    # strip() removes the whitespace the parser leaves at the end of 'value'.
    values = [
        (
            '{}:{}'.format(namespace, value).strip(),
            '{}{}'.format(prepend_text, value).strip(),
        )
        for namespace, value in zip(match_df['namespace'], match_df['value'])
    ]
    replace_tag(values, access_key, add_tag)
@cli.command()
@click.argument('count', type=int)
@click.argument('prepend_text')
@click.option('--tag_file', required=True)
@click.option('--access_key')
@click.option('--add_tag', multiple=True)
def replace_namespace_by_count(count, prepend_text, tag_file=None, access_key=None, add_tag=None):
    """Prepend PREPEND_TEXT to tags whose namespace occurs exactly COUNT times."""
    # --tag_file is required: without it there is nothing to parse.
    content_df = get_dataframe(tag_file)
    # Query the value_counts() Series directly.  The column name produced by
    # .to_frame() differs between pandas versions ('namespace' before 2.0,
    # 'count' from 2.0 on), so looking it up by name is not portable.
    namespace_counts = content_df['namespace'].value_counts()
    # int() is kept so the comparison stays numeric even if the callback is
    # invoked directly with a string.
    namespaces = namespace_counts[namespace_counts == int(count)].index
    # Rows without a namespace never match: value_counts() drops them.
    match_df = content_df[content_df['namespace'].isin(namespaces)]
    # Each pair is (tag as it exists now, the same tag with PREPEND_TEXT in
    # front).  The strip() removes the whitespace the parser leaves at the
    # end of 'value'.
    values = [
        (
            '{}:{}'.format(namespace, value).strip(),
            '{}{}:{}'.format(prepend_text, namespace, value).strip(),
        )
        for namespace, value in zip(match_df['namespace'], match_df['value'])
    ]
    replace_tag(values, access_key, add_tag)
# Run the click command group only when this file is executed as a script.
if __name__ == '__main__':
    cli()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment