Created
June 13, 2014 23:03
-
-
Save RichMorin/736e2e9014b41a3629ef to your computer and use it in GitHub Desktop.
redo_nodes - redo node sets into TSV data rows
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
dbpr_!Kheis_Local_Municipality || | |
dbpr_!Women_Art_Revolution || | |
dbpr_"A,"_My_Name_is_Alex_-_Parts_I_&_II || | |
dbpr_"Awesome" || | |
dbpr_"Banksy_of_Bulgaria" || | |
dbpr_"Big"_Brian_Subich || | |
dbpr_"Big"_Paul_Williams || | |
dbpr_"C"_Is_for_(Please_Insert_Sophomoric_Genitalia_Reference_Here) || | |
dbpr_"D"_Is_for_Dubby_–_The_Lustmord_Dub_Mixes || | |
dbpr_"Dor-en-Ernil" || | |
dbpr_"Go_Away" || | |
dbpr_"Gypsy"_in_Jazz || | |
dbpr_"Happy"_in_Galoshes || | |
dbpr_"In"_Jazz_for_the_Culture_Set || | |
dbpr_"Irish"_Teddy_Mann || | |
dbpr_"Isis"_of_the_Suebi || | |
dbpr_"King_Ernest"_Baker || | |
dbpr_"North_Shore_Railroad_(California) || | |
dbpr_"Oh_Yeah!"_Live || | |
dbpr_"R"_word || | |
dbpr_"Ridgeriders"_In_Concert || | |
dbpr_"Rommel?"_"Gunner_Who?" || | |
dbpr_"Singles" || | |
dbpr_"Southern_New_Jersey_Railroad" || | |
dbpr_"Sunshine"_Sonny_Payne || | |
dbpr_"The_Above_Ground_Sound"_of_Jake_Holmes || | |
dbpr_"The_Spaghetti_Incident?" || | |
dbpr_"U"_Is_for_Undertow || | |
dbpr_"Uncle_Tom's_Cabin"_Contrasted_with_Buckingham_Hall,_the_Planter's_Home || | |
dbpr_"Unplugged"_Live || | |
dbpr_"V"_Is_for_Vengeance || | |
dbpr_"Weird_Al"_Yankovic || | |
dbpr_"Weird_Al"_Yankovic_Live!_–_The_Alpocalypse_Tour || | |
dbpr_"Welding"_Kumar || | |
dbpr_"Wesleyan_Methodist_College" || | |
dbpr_"i" || | |
dbpr_$O$ || | |
dbpr_$_(Mark_Sultan_album) || | |
dbpr_$h*!_My_Dad_Says || | |
dbpr_$uga(r) || | |
dbpr_&_I_Made_A_Man || | |
dbpr_&_Then_Boom || | |
dbpr_''Same_Team''_Fallacy || | |
dbpr_'A_morte_'e_Carnevale || | |
dbpr_'Abd_Allah_ibn_'Amr_ibn_al-'As || | |
dbpr_'Abd_al-Razzaq_al-Hasani || | |
dbpr_'Abd_as-Sattar_Qasm || | |
dbpr_'Ajde_Jano || | |
dbpr_'Ali-Sultan || | |
dbpr_'Ali_ibn_al-Husayn_ibn_Quraysh || | |
dbpr_'Amanave || | |
dbpr_'Amr_III_ibn_al-Mundhir || | |
dbpr_'Amr_ibn_Adi || | |
dbpr_'Amr_ibn_Imru'_al-Qays || | |
dbpr_'Amr_ibn_al-'As || | |
dbpr_'Ana_Po'uhila || | |
dbpr_'Anin || | |
dbpr_'Aoa || | |
dbpr_'Arab_al-Jahalin || | |
dbpr_'Arab_al-Rashayida || | |
dbpr_'Au'asi || | |
dbpr_'Azazme || | |
dbpr_'Aziz_'Ali_al-Misri || | |
dbpr_'Bout_Changes_'n'_Things || | |
dbpr_'Bout_Love || | |
dbpr_'Bout_Soul || | |
dbpr_'Deed_I_Do || | |
dbpr_'Disco'_La_Passione || | |
dbpr_'Elisiva_Fusipala_Vaha'i || | |
dbpr_'Em_Are_I || | |
dbpr_'F'_Debut || | |
dbpr_'Galway_Joe'_Dolan || | |
dbpr_'How's_my_driving?'_sign || | |
dbpr_'Ili'ili || | |
dbpr_'It's_Alive!' || | |
dbpr_'Makholane || | |
dbpr_'Malakeng || | |
dbpr_'Mamants'O || | |
dbpr_'Moteng || | |
dbpr_'N_Sync || | |
dbpr_'N_Sync_in_Concert || | |
dbpr_'Neath_Austral_Skies || | |
dbpr_'O'ua || | |
dbpr_'Ole_language || | |
dbpr_'Op_o'_Me_Thumb || | |
dbpr_'Orfi_Shirazi || | |
dbpr_'Oro || | |
dbpr_'Ota_'ika || | |
dbpr_'Punnagai_Poo'_Gheetha || | |
dbpr_'R_Xmas || | |
dbpr_'Round_About_Midnight_at_the_Cafe_Bohemia || | |
dbpr_'Round_Here || | |
dbpr_'SUP_Magazine || | |
dbpr_'S_Wonderful_(album) || | |
dbpr_'Splosion_Man || | |
dbpr_'The_All-Species_Living_Tree'_Project || | |
dbpr_'The_Masterwork'_Award_Winning_Fish-Knife || | |
dbpr_'Til_Death_Do_Us_Party || | |
dbpr_'Til_I_Can_Make_It_on_My_Own || | |
dbpr_'Til_I_Gain_Control_Again || |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env julia
#
# redo_nodes - redo node sets into TSV data rows
#
# (WIP) Transliteration of redo_nodes (and bi_prep.rb) into Julia.
#
# Pkg.add("YAML")
#
# Written by Rich Morin, CFCL, 2014
require("YAML") | |
function from_repl()
    #
    # REPL entry point.  Fills in the four path globals with fixed
    # defaults (current directory, "oT.t2" as input, "oT.t2j" as output)
    # and then hands off to redo_nodes().

    global inp_dir, inp_file, out_dir, out_file

    println("from_repl")                                    #T

    inp_dir  = "."
    inp_file = "oT.t2"
    out_dir  = "."
    out_file = "oT.t2j"

    return redo_nodes()
end
function from_shell()
    #
    # Shell entry point (eg, via the "shebang line" at the top of the file).
    #
    # Expects exactly four command-line arguments:
    #
    #   <inp_dir> <inp_file> <out_dir> <out_file>
    #
    # They are stored in the like-named globals, then redo_nodes() is run.
    # With any other argument count, a usage message is printed and the
    # process exits with a non-zero status, so that a calling script can
    # tell the run did not happen.

    global inp_dir, inp_file, out_dir, out_file

    println("from_shell")                                   #T

    if length(ARGS) != 4
        arg_list = "<inp_dir> <inp_file> <out_dir> <out_file>"
        println("Usage: redo_nodes ", arg_list)
        exit(1)
    end

    inp_dir, inp_file, out_dir, out_file = ARGS
    redo_nodes()
end
function redo_nodes()
    #
    # Top-level driver: prepare the run, then convert the node file.
    #
    # Informal call tree:
    #
    #   redo_nodes
    #   | setup_run
    #   | | get_pred_info
    #   | do_node_file
    #   | | do_node_set
    #   | | | line_get
    #   | | | line_unget

    println("redo_nodes")                                   #T

    setup_run()
    return do_node_file()
end
function do_node_file()
    #
    # Walk the input node file, emitting merged TSV records.
    #
    # The input path is "<inp_dir>/<inp_file>" and the output path is
    # "<out_dir>/<out_file>", both taken from globals.  The open handles
    # are published as the globals f_nodes_inp and f_nodes_out, which
    # line_get() and do_node_set() use.
    #
    # Both handles are closed on the way out, even if processing fails,
    # so that buffered output is flushed to disk.

    global f_nodes_inp, f_nodes_out, inp_dir, inp_file, out_dir, out_file

    println("do_node_file")                                 #T

    path_inp = "$( inp_dir )/$( inp_file )"
    path_out = "$( out_dir )/$( out_file )"

    f_nodes_inp = open(path_inp)
    try
        f_nodes_out = open(path_out, "w")
        try
            # Loop while input has something to offer;
            # do_node_set() returns "" once the input is exhausted.
            while do_node_set() != ""
                # Nothing to do here: do_node_set() emits the record.
            end
        finally
            close(f_nodes_out)
        end
    finally
        close(f_nodes_inp)
    end
end
function do_node_set()
    #
    # Handle the next node set: one node line, followed by zero or more
    # property lines.  Fields on a line are separated by " || " (re_split).
    #
    # A line with more than two fields is treated as a property line
    # (subject, predicate, object); any other line is treated as the node
    # line of the following set, so it is pushed back with line_unget().
    #
    # Writes one record to f_nodes_out: the full node name, the parts
    # before and after its first underscore, then one column per entry
    # of prop_list (empty if the node has no such property), all joined
    # by fs_tsv.
    #
    # Returns the node name, or "" once the input is exhausted.
    # Raises an error if the node name contains no underscore.

    global parm_hash, prop_list, re_split

    parm_hash = Dict{Any,Any}()
    re_trim   = r"^'([^']+)'.*$"        # 'text'...  ->  text
    re_name   = r"^([^_]*)_(.*)$"s      # split at the first underscore

    # Get first line of node set.
    line = line_get()
    (line == "___EOF___ || ") && return ""

    # The node name is the line with every field separator removed.
    ns_name = join(split(line, re_split), "")

    while true
        line   = line_get()
        fields = split(line, re_split)
        subj   = fields[1]

        if length(fields) > 2       # Prop line: process and save info.
            pred, obj = fields[2], fields[3]

            if subj != ns_name                              #T
                println("Warning: '$( subj )' != '$( ns_name )'")
                println("sizes: ", length(subj), ", ", length(ns_name))
            end

            # Keep only the quoted part of a leading-quoted object.
            m = match(re_trim, obj)
            parm_hash[pred] = (m === nothing) ? obj : m.captures[1]

        else                        # Node line: emit results and return.
            line_unget(line)

            m_name = match(re_name, ns_name)
            if m_name === nothing
                error("redo_nodes: no '_' in node name '$( ns_name )'")
            end
            ns, name = m_name.captures[1], m_name.captures[2]

            data_list = map(key -> get(parm_hash, key, ""), prop_list)
            full_list = vcat([ ns_name, ns, name ], data_list)
            println(f_nodes_out, join(full_list, fs_tsv))

            return ns_name
        end
    end
end
function get_pred_info()
    #
    # Get predicate information from the YAML file "predicates.yaml",
    # whose path is formed relative to this source file.
    #
    # The file's "Predicates" entry maps each predicate name to a list,
    # which is read as follows:
    #
    #   [1]  "P" marks the predicate as a wanted property
    #   [2]  a type code (one of the keys of prop_types)
    #   [3]  "I" marks the predicate as one to index
    #   [4]  explanatory text
    #
    # Sets the global prop_list to the sorted names of the "P" predicates.

    global prop_list

    println("get_pred_info")                                #T

    # Type codes and the type names they stand for.  The table is filled
    # in by assignment, which avoids the "{ ... }" dictionary literal
    # (no longer accepted by later Julia versions).
    type_pairs = [
        ("__", "string"),
        ("bo", "boolean"),
        ("by", "byte"),
        ("ch", "char"),
        ("do", "double"),       # We use this for all floating point values.
        ("fl", "float"),
        ("in", "int"),          # We use this for all integer values.
        ("lo", "long"),
        ("sh", "short")
    ]
    prop_types = Dict{Any,Any}()
    for (code, type_name) in type_pairs
        prop_types[code] = type_name
    end

    raw_path  = string(@__FILE__, "/../predicates.yaml")
    yaml_path = normpath(raw_path)

    # The do-block form closes the file handle, even if loading fails.
    yaml_data = open(yaml_path) do io
        YAML.load(io)
    end

    pred_hash = yaml_data["Predicates"]
    pred_keys = sort( collect( keys(pred_hash) ) )
    prop_list = filter(k -> pred_hash[k][1] == "P", pred_keys)

    # Decode each predicate's list.
    # NOTE(review): pred_info is local and is only read by the debug dump
    # below; building it still fails loudly on an unknown type code.
    pred_info = Dict{Any,Any}()
    for key in pred_keys
        list = pred_hash[key]
        info = Dict{Any,Any}()
        info[:want_prop] = (list[1] == "P")
        info[:prop_type] = prop_types[ list[2] ]
        info[:index_me]  = (list[3] == "I")
        info[:exp_text]  = list[4]
        pred_info[key]   = info
    end

    if false                                                #T
        println("pred_info: '", pred_info, "'")
        println("pred_keys: '", pred_keys, "'")
        println("prop_list: '", prop_list, "'")
    end

    return nothing
end
function line_get()
    #
    # Get the next line from the nodes file, with its line terminator
    # removed.
    #
    # A line pushed back by line_unget() is returned first, and the
    # pushback cache is cleared.  Once the input is exhausted, every call
    # returns the sentinel "___EOF___ || ".

    global f_nodes_inp, line_cache

    # line_cache is assigned without a type annotation: Julia does not
    # accept a typed assignment to a global from inside a function.
    if line_cache != ""
        this_line  = line_cache
        line_cache = ""
        return this_line
    end

    eof(f_nodes_inp) && return "___EOF___ || "

    return chomp( readline(f_nodes_inp) )
end
function line_unget(line)
    #
    # Push a line back, so that the next line_get() call returns it.
    #
    # Only one line is held at a time; a second call before the next
    # line_get() overwrites the first.  Returns the line that was stored.

    global line_cache

    # Plain assignment: Julia does not accept a typed assignment to a
    # global from inside a function.
    line_cache = line
end
function setup_run()
    #
    # Set up the global settings for the run, then load the predicate
    # information via get_pred_info().
    #
    #   fs_tmp      the literal " || " separator
    #   re_split    regex that matches " || "
    #   fs_tsv      field separator for output records (a tab)
    #   lin_lim     1e4 in debug mode, 1e9 in production mode
    #   line_cache  pushback buffer used by line_get() and line_unget()
    #
    # Production mode is selected by CVT_PROD=Y in the environment;
    # anything else selects debug mode.
    #
    # NOTE(review): fs_tmp and lin_lim are not referenced by the other
    # functions visible in this file; they are still set, in case other
    # code relies on them.

    global fs_tmp, fs_tsv, lin_lim, line_cache, re_split

    println("setup_run")                                    #T

    # These are ordinary global assignments (no "const"), so that the
    # function can be called more than once in a session.
    fs_tmp     = " || "
    re_split   = r" \|\| "
    fs_tsv     = "\t"
    line_cache = ""

    cvt_prod = get(ENV, "CVT_PROD", "")
    run_mode = (cvt_prod == "Y") ? :production : :debug
    lin_lim  = (run_mode == :debug) ? 1e4 : 1e9             #D

    get_pred_info()
end
# Run from_shell() only when this file is executed as a script.  In an
# interactive session, from_repl() is the entry point; loading the file
# there must not inspect ARGS or call exit().
if !isinteractive()
    from_shell()
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment