Last active
January 19, 2024 10:17
-
-
Save msoutopico/4c2f070aab26d42bbb2ab8f69806eae3 to your computer and use it in GitHub Desktop.
get_omt_alt_data.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Extract the OmegaT working TM (omegat/project_save.tmx) from every
# non-outdated OMT package found under the workflow step directories,
# saving each TM as <package-name>.tmx in the local working_tms folder.
#
# Workflow steps (the first two produce no packages and are not scanned):
# 03_VERIF_from_Ver
# 04_VERIF_rev_del
# 05_FC_IN
# 06_FC_to_VER
# 07_FC_from_VER
# 08_FC_rev_del
# 09_POST-FC_requests
# 10_POST-SDS
# 10_POST-SDS_requests

src_root="/media/data/data/company/PISA_2021/MAIN_SURVEY"
out_dir="/home/manuel/pisa25/tred-transfer-issues/working_tms"

# Step directories that actually contain OMT packages to extract.
steps=(
  05_FC_IN
  06_FC_to_VER
  07_FC_from_VER
  08_FC_rev_del
  09_POST-FC_requests
  10_POST-SDS
  10_POST-SDS_requests
)

# 1. extract project_save from each package and name it after the project.
# NUL-delimited find + read loop: safe for paths with spaces/newlines.
# '! -ipath "*_Outdated*"' replaces the old 'grep -iv "_Outdated"' filter
# (both drop any path containing "_Outdated", case-insensitively).
for step in "${steps[@]}"; do
  while IFS= read -r -d '' f; do
    fn=$(basename "$f" .omt)
    echo "$fn"
    unzip -p "$f" omegat/project_save.tmx > "$out_dir/$fn.tmx"
  done < <(find "$src_root" -regextype egrep \
    -regex ".+/$step/.+PISA2022MS_OMT_(SCI|REA|MAT(New|Trend))_[a-z]{3}-[A-Z]{3}\.omt\$" \
    ! -ipath "*_Outdated*" -print0)
done
# cd into the working dir; abort if it does not exist.
cd /home/manuel/pisa25/tred-transfer-issues/working_tms || exit 1

# 2. remove the DOCTYPE declaration, which xmlstarlet chokes on.
# xargs -r: do nothing (rather than run sed with no args) when no file matches.
grep -rl 'DOCTYPE' -- *.tmx | xargs -r sed -i '/DOCTYPE/d'

# 3. extract alternative-translation tu nodes: tu elements that carry an
# 'id' prop, whose source (en-ZZ) and target segments differ, and whose
# source segment is not purely numeric.
# NOTE(review): glob assumes the working dir is flat (it is — step 1 writes
# all TMs directly into it); the old 'find *' would also have recursed.
for f in *.tmx; do
  [ -e "$f" ] || continue        # no *.tmx at all: skip literal pattern
  fn=$(basename "$f" .tmx)
  ls -ls "$f"
  [ -s "$f" ] || continue        # skip empty extractions (missing project_save)
  xmlstarlet select -t -c "//tu[prop[contains(@type, 'id')] and tuv[@lang='en-ZZ']/seg/text()!=tuv[@lang!='en-ZZ']/seg/text() and tuv[@lang='en-ZZ']/seg[translate(text(), '0123456789', '') != '']]" "$f" > "${fn}_alt.xml"
done

# 4. count the extracted tu nodes for each project.
# grep -c counts matching lines directly (replaces 'grep | wc -l').
for f in *.xml; do
  [ -e "$f" ] || continue
  count=$(grep -c "</tu>" "$f")
  echo "$f;$count"
done > alt_stats.csv

exit
# TODO: filter out any alternative translations whose source AND target text
# are both identical to those of some tu node in the list of default
# translations (tu nodes without prop children).
#
# One potential approach:
# - add a condition to the xpath expression used in step 3 above:
#   count the tu nodes that have no prop children and whose source text equals
#   the source text of the tu node being evaluated and whose target text equals
#   its target text; only extract the tu node being evaluated if that count is 0.
#
# Another potential approach:
# 1. get all the tu nodes with default translations, e.g.
for f in *.tmx; do
  [ -e "$f" ] || continue
  fn=$(basename "$f" .tmx)
  ls -ls "$f"
  [ -s "$f" ] || continue
  xmlstarlet select -t -c "//tu[not(prop)]" "$f" > "${fn}_def.xml"
done
# 2. then filter out any alternative translation whose source and target text
#    both match an entry in that default-translation list.
############################################################ | |
Some of the extracted working TMs are empty, because there is no omegat/project_save.tmx found in those packages. That seems to be because they were never unpacked (or they were unpacked but not re-packed); therefore the translations are in the source (bilingual) XLIFF files, but OmegaT never created a working TM in that project.
How to fix it: | |
- copy the OMT project to the local working TM and unpack it | |
- run OmegaT in console mode on the project to generate the master TM | |
- copy the master TM and extract the alternative translations from it | |
Alternatively: | |
- copy the OMT project to the local working TM and unpack it | |
- run OmegaT in console mode on the project with the script to export as single spreadsheet (including column for Alt/Uniq) | |
- copy the excel/tsv export and extract the alternative translations from it |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment