rachmadaniHaryono/hydrus_script.py

## hydrus_script.py
"""
This script replaces hydrus tag namespaces based on the given parameters and the hydrus tag counts.

The tag file is a list of hydrus tags, each followed by its count number.

Example:

$ # replace the 'https' namespace with 'url'
$ # "https://example.com" -> 'url:https://example.com'
$ hydrus_script.py replace-namespace --access_key 1234_access_key --tag_file hydrus.txt 'https' 'url:https:'
$ # prepend 'tag' to every tag whose namespace appears only once in the tag file
$ hydrus_script.py replace-namespace-by-count --access_key 1234_access_key --tag_file hydrus.txt 1 tag

"""
import logging

from hydrus import Client
from tabulate import tabulate
from tqdm import tqdm
import click
import pandas as pd


def get_dataframe(txt_file):
    """Parse a hydrus tag listing into a DataFrame.

    Every line of ``txt_file`` is read as ``namespace:value (count)`` or
    ``value (count)``.  The text inside the last pair of parentheses may be
    ``count`` or ``count-full_count``; dots in it are stripped first
    (presumably thousands separators -- confirm against a real export).

    Returns a DataFrame with the columns ``count``, ``full count``,
    ``namespace`` and ``value``.  All values are kept as text; ``full count``
    and ``namespace`` are empty (None) when the line does not carry them.
    ``value`` keeps whatever whitespace preceded the opening parenthesis.

    A line without ``(`` makes the parenthesis lookup fail with IndexError.
    """
    def _paren_text(line):
        # Text between the last "(" and the following ")", dots removed.
        return line.rsplit('(', 1)[1].split(')')[0].replace('.', '')

    def _full_count(text):
        # Only an exact "a-b" shape yields a full count.
        pieces = text.split('-')
        return pieces[1] if len(pieces) == 2 else None

    def _namespace(line):
        head, sep, _ = line.partition(':')
        return head if sep else None

    def _value(line):
        _, sep, rest = line.partition(':')
        body = rest if sep else line
        return body.rsplit('(', 1)[0]

    with open(txt_file) as handle:
        rows = handle.read().splitlines()

    frame = pd.DataFrame(rows)
    raw = frame[0]
    paren = raw.apply(_paren_text)
    frame['count'] = paren.apply(lambda text: text.split('-')[0])
    frame['full count'] = paren.apply(_full_count)
    frame['namespace'] = raw.apply(_namespace)
    frame['value'] = raw.apply(_value)
    # Only the derived columns are returned; the raw line column is dropped.
    return frame.drop(0, axis=1)


def replace_tag(tag_sets, access_key=None, add_tags=None):
    """Swap tags on every file that currently carries the old tag.

    tag_sets: iterable of ``(current_tag, new_tag)`` pairs; they are
        processed in sorted order behind a tqdm progress bar.
    access_key: handed to the hydrus ``Client`` constructor.
    add_tags: optional iterable of extra tags sent together with each new tag.

    Tags for which the search returns no files are skipped silently.  Any
    exception raised while fetching metadata or updating tags is reported
    through ``tqdm.write`` and the loop moves on to the next pair.
    """
    extra_tags = list(add_tags) if add_tags else []
    client = Client(access_key)
    for old_tag, replacement in tqdm(sorted(tag_sets)):
        file_ids = client.search_files([old_tag])
        if not file_ids:
            # TODO change to debug: report the tag pair that matched no file
            continue
        tqdm.write('tag:{}\nnew_tag:{}'.format(old_tag, replacement))
        try:
            metadata = client.file_metadata(file_ids=file_ids, only_identifiers=True)
            hashes = [entry['hash'] for entry in metadata]
            # NOTE(review): presumably action '0' adds and '1' deletes tags in
            # the hydrus client API -- confirm against the API docs.
            actions = {'0': [replacement] + extra_tags, '1': [old_tag]}
            client.add_tags(hashes, services_to_actions={'local tags': actions})
        except Exception as err:
            tqdm.write('tag:{}\nerror:{}'.format(old_tag, err))


# Root click command group; the commands defined below attach themselves to
# it through ``@cli.command()``.
@click.group()
def cli():
    pass


@cli.command()
@click.argument('tag_file')
def print_namespace_ranking(tag_file):
    # Tabulate how many tag lines of TAG_FILE fall under each namespace.
    tags = get_dataframe(tag_file)
    ranking = tags['namespace'].value_counts().to_frame()
    print(tabulate(ranking))


@cli.command()
@click.argument('tag_file')
@click.option('--namespace')
def print_tag(tag_file, namespace=None):
    """Print the parsed tags of TAG_FILE as a table.

    With --namespace only the tags of that namespace are printed.
    """
    content_df = get_dataframe(tag_file)
    if namespace:
        # Filter on the namespace given by the caller (previously a
        # hard-coded, misspelled namespace was used here).
        content_df = content_df[content_df['namespace'] == namespace]
    print(tabulate(content_df))


@cli.command()
@click.argument('target_namespace')
@click.argument('prepend_text')
@click.option('--tag_file')
@click.option('--access_key')
@click.option('--add_tag', multiple=True)
def replace_namespace(target_namespace, prepend_text, tag_file=None, access_key=None, add_tag=None):
    """Retag every tag of TARGET_NAMESPACE listed in the tag file.

    The current tag is rebuilt as "namespace:value" and the new tag is
    PREPEND_TEXT followed by the value; both are stripped of surrounding
    whitespace.  Each --add_tag is sent along with the new tag.
    """
    content_df = get_dataframe(tag_file)

    # Keep the filtered rows as a DataFrame: both the namespace and the value
    # column are needed below.
    match_df = content_df[content_df['namespace'] == target_namespace]
    values = [
        (
            '{}:{}'.format(namespace, value).strip(),
            '{}{}'.format(prepend_text, value).strip(),
        )
        for namespace, value in zip(match_df['namespace'], match_df['value'])
    ]
    replace_tag(values, access_key, add_tag)


@cli.command()
@click.argument('count')
@click.argument('prepend_text')
@click.option('--tag_file')
@click.option('--access_key')
@click.option('--add_tag', multiple=True)
def replace_namespace_by_count(count, prepend_text, tag_file=None, access_key=None, add_tag=None):
    """Retag every tag whose namespace occurs exactly COUNT times in the tag file.

    The current tag is rebuilt as "namespace:value" and the new tag is
    PREPEND_TEXT followed by "namespace:value"; both are stripped of
    surrounding whitespace.  Each --add_tag is sent along with the new tag.
    """
    content_df = get_dataframe(tag_file)

    # Work on the counts Series itself so the selection does not depend on
    # the column name that ``to_frame()`` assigns (it differs between pandas
    # versions).
    per_namespace = content_df['namespace'].value_counts()
    namespaces = set(per_namespace[per_namespace == int(count)].index)
    match_df = content_df[content_df['namespace'].isin(namespaces)]
    values = [
        (
            '{}:{}'.format(namespace, value).strip(),
            '{}{}:{}'.format(prepend_text, namespace, value).strip(),
        )
        for namespace, value in zip(match_df['namespace'], match_df['value'])
    ]
    replace_tag(values, access_key, add_tag)


# Run the click command group only when the file is executed as a script.
if __name__ == '__main__':
    cli()
	""""
	This script replaces hydrus tag namespaces based on the given parameters and the hydrus tag counts.

	The tag file is a list of hydrus tags, each followed by its count number.

	Example:

	$ # replace the 'https' namespace with 'url'
	$ # "https://example.com" -> 'url:https://example.com'
	$ hydrus_script.py replace-namespace --access_key 1234_access_key --tag_file hydrus.txt 'https' 'url:https:'
	$ # prepend 'tag' to every tag whose namespace appears only once in the tag file
	$ hydrus_script.py replace-namespace-by-count --access_key 1234_access_key --tag_file hydrus.txt 1 tag

	"""
	import logging

	from hydrus import Client
	from tabulate import tabulate
	from tqdm import tqdm
	import click
	import pandas as pd


	def get_dataframe(txt_file):
	    """Parse a hydrus tag listing into a DataFrame.

	    Every line of ``txt_file`` is read as ``namespace:value (count)`` or
	    ``value (count)``.  The text inside the last pair of parentheses may be
	    ``count`` or ``count-full_count``; dots in it are stripped first
	    (presumably thousands separators -- confirm against a real export).

	    Returns a DataFrame with the columns ``count``, ``full count``,
	    ``namespace`` and ``value``.  All values are kept as text; ``full count``
	    and ``namespace`` are empty (None) when the line does not carry them.

	    A line without ``(`` makes the parenthesis lookup fail with IndexError.
	    """
	    def _paren_text(line):
	        # Text between the last "(" and the following ")", dots removed.
	        return line.rsplit('(', 1)[1].split(')')[0].replace('.', '')

	    def _full_count(text):
	        # Only an exact "a-b" shape yields a full count.
	        pieces = text.split('-')
	        return pieces[1] if len(pieces) == 2 else None

	    def _namespace(line):
	        head, sep, _ = line.partition(':')
	        return head if sep else None

	    def _value(line):
	        _, sep, rest = line.partition(':')
	        body = rest if sep else line
	        return body.rsplit('(', 1)[0]

	    with open(txt_file) as handle:
	        rows = handle.read().splitlines()

	    frame = pd.DataFrame(rows)
	    raw = frame[0]
	    paren = raw.apply(_paren_text)
	    frame['count'] = paren.apply(lambda text: text.split('-')[0])
	    frame['full count'] = paren.apply(_full_count)
	    frame['namespace'] = raw.apply(_namespace)
	    frame['value'] = raw.apply(_value)
	    # Only the derived columns are returned; the raw line column is dropped.
	    return frame.drop(0, axis=1)


	def replace_tag(tag_sets, access_key=None, add_tags=None):
	    """Swap tags on every file that currently carries the old tag.

	    tag_sets: iterable of ``(current_tag, new_tag)`` pairs; they are
	        processed in sorted order behind a tqdm progress bar.
	    access_key: handed to the hydrus ``Client`` constructor.
	    add_tags: optional iterable of extra tags sent together with each new tag.

	    Tags for which the search returns no files are skipped silently.  Any
	    exception raised while fetching metadata or updating tags is reported
	    through ``tqdm.write`` and the loop moves on to the next pair.
	    """
	    add_tags = list(add_tags) if add_tags else []
	    cl = Client(access_key)
	    for current_tag, new_tag in tqdm(sorted(tag_sets)):
	        fids = cl.search_files([current_tag])
	        if not fids:
	            # TODO change to debug: report the tag pair that matched no file
	            continue
	        tqdm.write('tag:{}\nnew_tag:{}'.format(current_tag, new_tag))
	        try:
	            fmds = cl.file_metadata(file_ids=fids, only_identifiers=True)
	            # NOTE(review): presumably action '0' adds and '1' deletes tags
	            # in the hydrus client API -- confirm against the API docs.
	            cl.add_tags([x['hash'] for x in fmds], services_to_actions={'local tags': {
	                '0': [new_tag] + add_tags, '1': [current_tag]
	            }})
	        except Exception as err:
	            tqdm.write('tag:{}\nerror:{}'.format(current_tag, err))


	# Root click command group; the commands defined below attach themselves
	# to it through ``@cli.command()``.
	@click.group()
	def cli():
	    pass


	@cli.command()
	@click.argument('tag_file')
	def print_namespace_ranking(tag_file):
	    # Tabulate how many tag lines of TAG_FILE fall under each namespace.
	    content_df = get_dataframe(tag_file)
	    print(tabulate(content_df['namespace'].value_counts().to_frame()))


	@cli.command()
	@click.argument('tag_file')
	@click.option('--namespace')
	def print_tag(tag_file, namespace=None):
	    """Print the parsed tags of TAG_FILE as a table.

	    With --namespace only the tags of that namespace are printed.
	    """
	    content_df = get_dataframe(tag_file)
	    if namespace:
	        # Filter on the namespace given by the caller (previously a
	        # hard-coded, misspelled namespace was used here).
	        content_df = content_df[content_df['namespace'] == namespace]
	    print(tabulate(content_df))


	@cli.command()
	@click.argument('target_namespace')
	@click.argument('prepend_text')
	@click.option('--tag_file')
	@click.option('--access_key')
	@click.option('--add_tag', multiple=True)
	def replace_namespace(target_namespace, prepend_text, tag_file=None, access_key=None, add_tag=None):
	    """Retag every tag of TARGET_NAMESPACE listed in the tag file.

	    The current tag is rebuilt as "namespace:value" and the new tag is
	    PREPEND_TEXT followed by the value; both are stripped of surrounding
	    whitespace.  Each --add_tag is sent along with the new tag.
	    """
	    content_df = get_dataframe(tag_file)

	    # Keep the filtered rows as a DataFrame: both the namespace and the
	    # value column are needed below.
	    match_df = content_df[content_df['namespace'] == target_namespace]
	    values = [
	        (
	            '{}:{}'.format(namespace, value).strip(),
	            '{}{}'.format(prepend_text, value).strip(),
	        )
	        for namespace, value in zip(match_df['namespace'], match_df['value'])
	    ]
	    replace_tag(values, access_key, add_tag)



	@cli.command()
	@click.argument('count')
	@click.argument('prepend_text')
	@click.option('--tag_file')
	@click.option('--access_key')
	@click.option('--add_tag', multiple=True)
	def replace_namespace_by_count(count, prepend_text, tag_file=None, access_key=None, add_tag=None):
	    """Retag every tag whose namespace occurs exactly COUNT times in the tag file.

	    The current tag is rebuilt as "namespace:value" and the new tag is
	    PREPEND_TEXT followed by "namespace:value"; both are stripped of
	    surrounding whitespace.  Each --add_tag is sent along with the new tag.
	    """
	    content_df = get_dataframe(tag_file)

	    # Work on the counts Series itself so the selection does not depend on
	    # the column name that ``to_frame()`` assigns (it differs between
	    # pandas versions).
	    per_namespace = content_df['namespace'].value_counts()
	    namespaces = set(per_namespace[per_namespace == int(count)].index)
	    match_df = content_df[content_df['namespace'].isin(namespaces)]
	    values = [
	        (
	            '{}:{}'.format(namespace, value).strip(),
	            '{}{}:{}'.format(prepend_text, namespace, value).strip(),
	        )
	        for namespace, value in zip(match_df['namespace'], match_df['value'])
	    ]
	    replace_tag(values, access_key, add_tag)


	# Run the click command group only when the file is executed as a script.
	if __name__ == '__main__':
	    cli()