Skip to content

Instantly share code, notes, and snippets.

@aammd
Last active March 8, 2017 18:02
Show Gist options
  • Save aammd/39d05299686a59c00e2e2612ea995ddb to your computer and use it in GitHub Desktop.
Save aammd/39d05299686a59c00e2e2612ea995ddb to your computer and use it in GitHub Desktop.
No longer needed: scripts that were used to clean the data
# Set up the carried-over state and emit the TSV header row.
# Each state variable starts at 0, so a row printed before its parent
# family/host line has been seen shows "0" in that column.
BEGIN {
    fam = 0
    parasite_genus = 0
    host_genus = 0
    # host_species is printed by the parasite rule, so it gets the same
    # sentinel as the other carried-over values.
    host_species = 0
    # Not read by the host/parasite rules below; kept as in the original.
    parasite_species = 0
    print "Host_fam\tHost_genus\tHost_sp\tParasite_genus\tParasite_sp\tk_i\tLocation\tParasite_fam"
}
# Family heading: a line whose first character is a capital letter.
# Its first field becomes the current family for the rows that follow.
$0 ~ /^[A-Z]/ {
    fam = $1
}
# Host line: exactly one leading tab, then a capital letter.
# Records the host genus, species and location for the parasite rows below it.
# (Field numbers assume the default whitespace field separator, which
# ignores the leading tab.)
$0 ~ /^\t[A-Z]/ {
    host_genus = $1
    # A parenthesised second field (presumably a subgenus) is skipped, so
    # the species is taken from the field after it.
    host_species = ($2 ~ /\(.*\)/) ? $3 : $2
    # gawk three-argument match(): loc[1] receives the parenthesised text
    # that ends the line; loc is emptied when the line has none.
    match($0, /.*(\(.*\))$/, loc)
}
# Parasite line: two leading tabs, then a capital letter.
# Prints one TSV row combining the remembered family/host state with the
# parasite described on this line. Requires gawk (three-argument match()).
/^\t\t[A-Z]/ {
    # The parasite family is in brackets; it may or may not be at the end
    # of the line. parasite_fam[1] is the last bracketed group on the line.
    match($0, /.*(\(.*\)).?/, parasite_fam)

    # A full parasite genus name is written in sentence case; any other
    # first field (presumably an abbreviation) keeps the previous genus.
    if ($1 ~ /[A-Z][a-z]/) parasite_genus = $1

    # Koinobiont (k), idiobiont (i) or question mark: the run of those
    # characters that ends the line. A plain "+" is used because the lazy
    # "+?" form is not defined for POSIX extended regular expressions.
    # NOTE(review): a line that simply ends in a word finishing with k, i
    # or ? (e.g. a species epithet) also yields a flag here -- confirm
    # against the data.
    match($0, /([ki?]+)$/, ki)

    # Last number on the line, with all of its digits (so "sp. 12" keeps
    # "12" rather than only the final "2").
    match($0, /([0-9]+)[^0-9]*$/, parasite_num)
    parasite_sp_name = $2 parasite_num[1]

    printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", fam, host_genus, host_species, parasite_genus, parasite_sp_name, ki[1], loc[1], parasite_fam[1]
}
# Set up the carried-over state and emit the TSV header row.
# Each state variable starts at 0, so a row printed before its parent
# line has been seen shows "0" in that column.
BEGIN {
    fam = 0
    parasite_genus = 0
    host_genus = 0
    parasite_species = 0
    # Column titles follow the order of the values the row-printing rule
    # emits: fam, parasite_genus, parasite_species, host_genus, $2, $NF,
    # loc[1].
    # NOTE(review): the first column is labelled Parasite_fam on the
    # assumption that the family heading belongs to the parasites nested
    # directly under it -- confirm against the input file.
    print "Parasite_fam\tParasite_genus\tParasite_sp\tHost_genus\tHost_sp\tk_i\tLocation"
}
# Family heading: a line whose first character is a capital letter.
# Its first field becomes the current family for the rows that follow.
$0 ~ /^[A-Z]/ {
    fam = $1
}
# Parasite line: some leading text (optionally starting with "("), then a
# tab and a capital letter. Records the parasite genus, species and
# location for the host rows below it. Requires gawk (three-argument match()).
#
# Host lines (two leading tabs, then a capital) are excluded explicitly:
# the ".*" in the first pattern can swallow a leading tab, so without the
# guard every host line would overwrite the parasite state just before
# the host rule prints it.
/^\(?.*\t[A-Z]/ && !/^\t\t[A-Z]/ {
    # The genus is read from the second field, i.e. the text before the
    # tab is expected to form the first field.
    parasite_genus = $2
    # A parenthesised third field (presumably a subgenus) is skipped.
    if ($3 ~ /\(.*\)/)
        parasite_species = $4
    else
        parasite_species = $3
    # loc[1] receives the parenthesised text that ends the line; loc is
    # emptied when the line has none.
    match($0, /.*(\(.*\))$/, loc)
}
# Host line: two leading tabs, then a capital letter.
# Prints one TSV row pairing the remembered family/parasite state with
# this host.
$0 ~ /^\t\t[A-Z]/ {
    # Only a sentence-case first field replaces the remembered host genus;
    # anything else (presumably an abbreviation) keeps the previous one.
    if ($1 ~ /[A-Z][a-z]/) {
        host_genus = $1
    }
    # The second field is printed as the host species and the last field
    # as the k/i flag.
    printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
           fam, parasite_genus, parasite_species,
           host_genus, $2, $NF, loc[1])
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment