Created
July 12, 2019 10:55
-
-
Save elnikkis/09cd2e2375800c9fad5b7b7434d60bf4 to your computer and use it in GitHub Desktop.
ノードID変換
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
''' | |
edgelistとlabellistを0始まりのIDに変換する | |
''' | |
import os | |
import sys | |
#from snlocest.largedict import LargeDict | |
def parse_args():
    """Parse the command line of the ID-conversion script.

    Positional arguments:
        mode        processing mode (the script body handles 'table',
                    'edgelist' and 'label')
        inputfiles  one or more input file paths

    Required option:
        --tablepath path of the conversion table file

    Returns:
        argparse.Namespace with ``mode``, ``tablepath`` and ``inputfiles``.
    """
    from argparse import ArgumentParser

    cli = ArgumentParser()
    # Positional arguments are consumed in declaration order, so 'mode'
    # must be registered before 'inputfiles'.
    cli.add_argument('mode', help='処理のモード')
    cli.add_argument('--tablepath', required=True)
    cli.add_argument('inputfiles', nargs='+')
    namespace = cli.parse_args()
    return namespace
def load_table(filepath):
    """Load a conversion table from a tab-separated file.

    Every non-blank line must contain exactly two tab-separated columns:
    the original ID followed by its integer index.

    Args:
        filepath: path of the table file to read.

    Returns:
        A tuple ``(user2id, idcnt)`` where ``user2id`` maps each original ID
        (str) to its index (int) and ``idcnt`` is the largest index plus one,
        or 0 when the table has no entries.

    Raises:
        ValueError: if a non-blank line does not have exactly two columns or
            its second column is not an integer.
    """
    user2id = dict()
    with open(filepath, 'r') as fd:
        for line in fd:
            record = line.rstrip()
            if not record:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            user_id, idx = record.split('\t')
            user2id[user_id] = int(idx)
    # default=-1 makes an empty table yield idcnt == 0 instead of raising.
    idcnt = max(user2id.values(), default=-1) + 1
    return user2id, idcnt
if __name__ == '__main__':
    # Script entry point: dispatch on the requested mode.
    #   table    - build a conversion table from the input files
    #   edgelist - rewrite the first two columns of each line using the table
    #   label    - rewrite the first column of each line using the table
    args = parse_args()

    if args.mode == 'table':
        # Generate the conversion table from inputfiles and save it to
        # tablepath. An existing table is never overwritten.
        if os.path.exists(args.tablepath):
            print('Table path already exists', file=sys.stderr)
            sys.exit(1)

        # Assign indices 0, 1, 2, ... in order of first appearance.
        user2id = dict()
        for filepath in args.inputfiles:
            with open(filepath, 'r') as inputfile:
                for line in inputfile:
                    tokens = line.rstrip().split('\t')
                    # The first two columns are IDs; indexing (rather than
                    # slicing) keeps the IndexError on lines with fewer
                    # than two columns.
                    for user_id in (tokens[0], tokens[1]):
                        if user_id not in user2id:
                            user2id[user_id] = len(user2id)

        # Save the table as "<original id>TAB<index>" lines.
        with open(args.tablepath, 'w') as fd:
            for k, v in user2id.items():
                print(k, v, sep='\t', file=fd)

    elif args.mode == 'edgelist':
        # Convert the source and destination columns; any further columns
        # are passed through unchanged.
        user2id, _ = load_table(args.tablepath)
        for filepath in args.inputfiles:
            with open(filepath, 'r') as fd:
                for line in fd:
                    row = line.rstrip().split('\t')
                    src = row[0]
                    dst = row[1]
                    print(user2id[src], user2id[dst], *row[2:], sep='\t')

    elif args.mode == 'label':
        # Convert the first column; the remaining columns are passed through.
        user2id, _ = load_table(args.tablepath)
        for filepath in args.inputfiles:
            with open(filepath, 'r') as fd:
                for line in fd:
                    row = line.rstrip().split('\t')
                    label = row[0]
                    print(user2id[label], *row[1:], sep='\t')

    else:
        print('Invalid mode. Choose "table" or "edgelist" or "label"', file=sys.stderr)
        # Signal the failure to the caller, like the other error path above.
        sys.exit(1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::io; | |
use std::io::prelude::*; | |
use std::collections::HashMap; | |
use std::hash; | |
use std::env; | |
use std::convert::TryInto; | |
use std::fmt::Display; | |
use std::fs::File; | |
/// Conversion from a node key of type `K` to a `u128` index.
///
/// `K` must be hashable, comparable for equality and printable.
trait NodeToIndex<K: Eq + hash::Hash + Display> {
    /// Returns the index for `node`.
    ///
    /// Takes `&mut self` and consumes `node`, so an implementor is able to
    /// store the key while answering.
    fn to_index(&mut self, node: K) -> u128;
}
/// Splits one edge-list line into its source and destination node names.
///
/// The line is split on tab characters; the first two fields are returned as
/// owned strings and any further fields are ignored.
///
/// # Panics
///
/// Panics with a message quoting the offending line when the line contains no
/// tab, i.e. when the destination field is missing.
fn parse_edge(line: &str) -> (String, String) {
    let mut fields = line.split('\t');
    let src = fields
        .next()
        .expect("`str::split` always yields at least one field");
    // TODO: report a failed split as a recoverable error instead of panicking.
    let dst = fields.next().unwrap_or_else(|| {
        panic!(
            "malformed edge line (expected `src<TAB>dst`): {:?}",
            line
        )
    });
    (src.to_string(), dst.to_string())
}
impl<K> NodeToIndex<K> for HashMap<K, u128> where K: Eq + hash::Hash + Display { | |
fn to_index(&mut self, node: K) -> u128 { | |
match self.get(&node) { | |
Some(&index) => index, | |
None => { | |
let index: u128 = self.len().try_into().unwrap(); | |
self.insert(node, index); | |
index | |
} | |
} | |
} | |
} | |
/// Command-line configuration: where the converted edge list and the
/// conversion table are written.
#[derive(Debug)]
struct Config {
    /// First positional argument: destination of the converted edge list.
    converted_filename: String,
    /// Second positional argument: destination of the conversion table.
    table_filename: String,
}

impl Config {
    /// Builds a `Config` from an argument iterator such as `std::env::args()`.
    ///
    /// The first item is treated as the program name and discarded; the next
    /// two items become `converted_filename` and `table_filename`. Accepting
    /// any `Iterator<Item = String>` keeps existing `env::args()` callers
    /// working while allowing construction from in-memory arguments.
    ///
    /// # Panics
    ///
    /// Panics if either of the two filenames is missing.
    fn new<I>(mut args: I) -> Config
    where
        I: Iterator<Item = String>,
    {
        args.next(); // skip program name
        let converted_filename = args.next().expect("Missing converted filename");
        let table_filename = args.next().expect("Missing table filename");
        Config {
            converted_filename,
            table_filename,
        }
    }
}
fn main() { | |
let config = Config::new(env::args()); | |
let stdout = io::stdout(); | |
let mut edge_writer: Box<dyn Write> = match config.converted_filename.as_ref() { | |
"-" => { | |
Box::new(stdout.lock()) | |
}, | |
_ => Box::new(File::create("assets/converted_edgelist.tsv").expect("Cannot create a file.")) | |
}; | |
let mut table_writer: Box<dyn Write> = match config.table_filename.as_ref() { | |
"-" => { | |
Box::new(stdout.lock()) | |
}, | |
_ => Box::new(File::create("assets/convert_table.tsv").expect("Cannot create a file.")) | |
}; | |
let mut converter = HashMap::new(); | |
for line in io::stdin().lock().lines() { | |
let (src_node, dst_node) = parse_edge(&line.unwrap()); | |
let src_id = converter.to_index(src_node); | |
let dst_id = converter.to_index(dst_node); | |
writeln!(edge_writer, "{}\t{}", src_id, dst_id).unwrap(); | |
} | |
for (key, value) in converter.iter() { | |
writeln!(table_writer, "{}\t{}", key, value).unwrap(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment