Skip to content

Instantly share code, notes, and snippets.

@elnikkis
Created July 12, 2019 10:55
Show Gist options
  • Save elnikkis/09cd2e2375800c9fad5b7b7434d60bf4 to your computer and use it in GitHub Desktop.
Save elnikkis/09cd2e2375800c9fad5b7b7434d60bf4 to your computer and use it in GitHub Desktop.
ノードID変換
# -*- coding: utf-8 -*-
'''
edgelistとlabellistを0始まりのIDに変換する
'''
import os
import sys
#from snlocest.largedict import LargeDict
def parse_args(argv=None):
    '''Parse the command-line arguments of the ID conversion script.

    Args:
        argv: Optional list of argument strings, without the program name.
            When ``None`` (the default), argparse falls back to
            ``sys.argv[1:]``, which is what the script entry point relies on.

    Returns:
        argparse.Namespace with the attributes ``mode`` (str),
        ``tablepath`` (str) and ``inputfiles`` (list of str, at least one).
    '''
    # Imported locally so that importing this module does not pull in argparse.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('mode', help='処理のモード')
    parser.add_argument('--tablepath', required=True)
    parser.add_argument('inputfiles', nargs='+')
    return parser.parse_args(argv)
def load_table(filepath):
    '''Load a conversion table from a tab-separated file.

    Each non-blank line must have exactly two tab-separated fields: the
    original ID and its integer index.

    Args:
        filepath: Path of the table file to read.

    Returns:
        A tuple ``(user2id, idcnt)`` where ``user2id`` maps each original ID
        (str) to its index (int) and ``idcnt`` is the largest index plus one
        (0 when the table is empty).

    Raises:
        ValueError: If a non-blank line does not have exactly two fields or
            the index is not an integer.
    '''
    user2id = dict()  # LargeDict()
    with open(filepath, 'r') as fd:
        for line in fd:
            line = line.rstrip()
            if not line:
                # Tolerate blank lines instead of failing on the unpack below.
                continue
            user_id, idx = line.split('\t')
            user2id[user_id] = int(idx)
    # max() of an empty sequence raises; an empty table simply starts at 0.
    idcnt = max(user2id.values(), default=-1) + 1
    return user2id, idcnt
if __name__ == '__main__':
    # Script entry point. Depending on `mode` it either builds the ID table
    # ("table") or rewrites input files with the table ("edgelist" / "label").
    args = parse_args()

    if args.mode == 'table':
        # Generate convert table from inputfiles and save to tablepath
        if os.path.exists(args.tablepath):
            # Never overwrite an existing table: files converted with it
            # would no longer match.
            print('Table path already exists', file=sys.stderr)
            sys.exit(1)

        # create table
        user2id = dict()  # LargeDict()
        for filepath in args.inputfiles:
            with open(filepath, 'r') as inputfile:
                for line in inputfile:
                    line = line.rstrip()
                    if not line:
                        # Skip blank lines instead of failing on a missing column.
                        continue
                    # The first two columns hold node IDs. IDs are numbered
                    # from 0 in order of first appearance, so the next free
                    # index is always the current table size.
                    for user_id in line.split('\t')[:2]:
                        if user_id not in user2id:
                            user2id[user_id] = len(user2id)

        # save table
        with open(args.tablepath, 'w') as fd:
            for k, v in user2id.items():
                print(k, v, sep='\t', file=fd)

    elif args.mode == 'edgelist':
        # Convert edgelist using table: the first two columns are replaced by
        # their indices, any remaining columns are passed through unchanged.
        user2id, _ = load_table(args.tablepath)
        for filepath in args.inputfiles:
            with open(filepath, 'r') as fd:
                for line in fd:
                    line = line.rstrip()
                    if not line:
                        continue
                    row = line.split('\t')
                    src = row[0]
                    dst = row[1]
                    print(user2id[src], user2id[dst], *row[2:], sep='\t')

    elif args.mode == 'label':
        # Convert label file: only the first column is an ID, the rest is
        # passed through unchanged.
        user2id, _ = load_table(args.tablepath)
        for filepath in args.inputfiles:
            with open(filepath, 'r') as fd:
                for line in fd:
                    line = line.rstrip()
                    if not line:
                        continue
                    row = line.split('\t')
                    label = row[0]
                    print(user2id[label], *row[1:], sep='\t')

    else:
        print('Invalid mode. Choose "table" or "edgelist" or "label"', file=sys.stderr)
        # Signal the failure to the caller; previously this path exited with 0.
        sys.exit(1)
use std::io;
use std::io::prelude::*;
use std::collections::HashMap;
use std::hash;
use std::env;
use std::convert::TryInto;
use std::fmt::Display;
use std::fs::File;
/// Conversion from a node of type `K` to a numeric (`u128`) index.
///
/// `K` must be hashable, comparable for equality and printable.
trait NodeToIndex<K: Eq + hash::Hash + Display> {
    /// Returns the index associated with `node`.
    ///
    /// Takes `&mut self`, so an implementor is allowed to update its own
    /// state while answering.
    fn to_index(&mut self, node: K) -> u128;
}
/// Splits one tab-separated edge line into its source and destination nodes.
///
/// Only the first two fields are used; any further fields are ignored.
///
/// # Panics
///
/// Panics when `line` contains no tab character, i.e. when there is no
/// second field.
fn parse_edge(line: &str) -> (String, String) {
    let mut fields = line.split('\t').map(str::to_string);
    // TODO: report a proper error when the split does not yield two fields.
    let source = fields.next().unwrap();
    let destination = fields.next().unwrap();
    (source, destination)
}
/// A `HashMap` from node to index acts as a growing lookup table: a node that
/// has been seen before keeps its index, a new node receives the current map
/// size as its index (so indices are 0, 1, 2, ... in order of first use).
impl<K> NodeToIndex<K> for HashMap<K, u128> where K: Eq + hash::Hash + Display {
    /// Returns the index of `node`, registering it first if it is unknown.
    fn to_index(&mut self, node: K) -> u128 {
        // Computed up front because `entry` borrows the map mutably.
        let next_index: u128 = self
            .len()
            .try_into()
            .expect("a usize length always fits in u128");
        // The entry API hashes the key once, instead of once for the lookup
        // and once more for the insert.
        *self.entry(node).or_insert(next_index)
    }
}
/// Output destinations taken from the command line.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Config {
    /// Destination of the converted edge list; `main` treats `"-"` as
    /// standard output.
    converted_filename: String,
    /// Destination of the node-to-index table; `main` treats `"-"` as
    /// standard output.
    table_filename: String,
}
impl Config {
fn new(mut args: env::Args) -> Config {
args.next(); // skip program name
let converted_filename = args.next().expect("Missing converted filename");
let table_filename = args.next().expect("Missing table filename");
Config { converted_filename, table_filename }
}
}
fn main() {
let config = Config::new(env::args());
let stdout = io::stdout();
let mut edge_writer: Box<dyn Write> = match config.converted_filename.as_ref() {
"-" => {
Box::new(stdout.lock())
},
_ => Box::new(File::create("assets/converted_edgelist.tsv").expect("Cannot create a file."))
};
let mut table_writer: Box<dyn Write> = match config.table_filename.as_ref() {
"-" => {
Box::new(stdout.lock())
},
_ => Box::new(File::create("assets/convert_table.tsv").expect("Cannot create a file."))
};
let mut converter = HashMap::new();
for line in io::stdin().lock().lines() {
let (src_node, dst_node) = parse_edge(&line.unwrap());
let src_id = converter.to_index(src_node);
let dst_id = converter.to_index(dst_node);
writeln!(edge_writer, "{}\t{}", src_id, dst_id).unwrap();
}
for (key, value) in converter.iter() {
writeln!(table_writer, "{}\t{}", key, value).unwrap();
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment