Last active
January 19, 2024 10:17
-
-
Save msoutopico/4c2f070aab26d42bbb2ab8f69806eae3 to your computer and use it in GitHub Desktop.
get_omt_alt_data.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Extract the OmegaT working TM (omegat/project_save.tmx) from every
# non-outdated OMT package found under the workflow step directories,
# saving each TM as <package-name>.tmx in the local working_tms folder.
#
# Workflow steps (the first two produce no packages and are not scanned):
# 03_VERIF_from_Ver
# 04_VERIF_rev_del
# 05_FC_IN
# 06_FC_to_VER
# 07_FC_from_VER
# 08_FC_rev_del
# 09_POST-FC_requests
# 10_POST-SDS
# 10_POST-SDS_requests

src_root="/media/data/data/company/PISA_2021/MAIN_SURVEY"
out_dir="/home/manuel/pisa25/tred-transfer-issues/working_tms"

# Step directories that actually contain OMT packages to extract.
steps=(
  05_FC_IN
  06_FC_to_VER
  07_FC_from_VER
  08_FC_rev_del
  09_POST-FC_requests
  10_POST-SDS
  10_POST-SDS_requests
)

# 1. extract project_save from each package and name it after the project.
# NUL-delimited find + read loop: safe for paths with spaces/newlines.
# '! -ipath "*_Outdated*"' replaces the old 'grep -iv "_Outdated"' filter
# (both drop any path containing "_Outdated", case-insensitively).
for step in "${steps[@]}"; do
  while IFS= read -r -d '' f; do
    fn=$(basename "$f" .omt)
    echo "$fn"
    unzip -p "$f" omegat/project_save.tmx > "$out_dir/$fn.tmx"
  done < <(find "$src_root" -regextype egrep \
    -regex ".+/$step/.+PISA2022MS_OMT_(SCI|REA|MAT(New|Trend))_[a-z]{3}-[A-Z]{3}\.omt\$" \
    ! -ipath "*_Outdated*" -print0)
done
# cd into the working dir; abort if it does not exist.
cd /home/manuel/pisa25/tred-transfer-issues/working_tms || exit 1

# 2. remove the DOCTYPE declaration, which xmlstarlet chokes on.
# xargs -r: do nothing (rather than run sed with no args) when no file matches.
grep -rl 'DOCTYPE' -- *.tmx | xargs -r sed -i '/DOCTYPE/d'

# 3. extract alternative-translation tu nodes: tu elements that carry an
# 'id' prop, whose source (en-ZZ) and target segments differ, and whose
# source segment is not purely numeric.
# NOTE(review): glob assumes the working dir is flat (it is — step 1 writes
# all TMs directly into it); the old 'find *' would also have recursed.
for f in *.tmx; do
  [ -e "$f" ] || continue        # no *.tmx at all: skip literal pattern
  fn=$(basename "$f" .tmx)
  ls -ls "$f"
  [ -s "$f" ] || continue        # skip empty extractions (missing project_save)
  xmlstarlet select -t -c "//tu[prop[contains(@type, 'id')] and tuv[@lang='en-ZZ']/seg/text()!=tuv[@lang!='en-ZZ']/seg/text() and tuv[@lang='en-ZZ']/seg[translate(text(), '0123456789', '') != '']]" "$f" > "${fn}_alt.xml"
done

# 4. count the extracted tu nodes for each project.
# grep -c counts matching lines directly (replaces 'grep | wc -l').
for f in *.xml; do
  [ -e "$f" ] || continue
  count=$(grep -c "</tu>" "$f")
  echo "$f;$count"
done > alt_stats.csv

exit
# TODO: filter out any alternative translations whose source AND target text
# are both identical to those of some tu node in the list of default
# translations (tu nodes without prop children).
#
# One potential approach:
# - add a condition to the xpath expression used in step 3 above:
#   count the tu nodes that have no prop children and whose source text equals
#   the source text of the tu node being evaluated and whose target text equals
#   its target text; only extract the tu node being evaluated if that count is 0.
#
# Another potential approach:
# 1. get all the tu nodes with default translations, e.g.
for f in *.tmx; do
  [ -e "$f" ] || continue
  fn=$(basename "$f" .tmx)
  ls -ls "$f"
  [ -s "$f" ] || continue
  xmlstarlet select -t -c "//tu[not(prop)]" "$f" > "${fn}_def.xml"
done
# 2. then filter out any alternative translation whose source and target text
#    both match an entry in that default-translation list.
############################################################ | |
Some of the extracted working TMs are empty, because there is no omegat/project_save.tmx found in those packages. That seems to be because they were never unpacked (or they were unpacked but not re-packed); therefore the translations are in the source (bilingual) XLIFF files, but OmegaT never created a working TM in that project.
How to fix it: | |
- copy the OMT project to the local working TM and unpack it | |
- run OmegaT in console mode on the project to generate the master TM | |
- copy the master TM and extract the alternative translations from it | |
Alternatively: | |
- copy the OMT project to the local working TM and unpack it | |
- run OmegaT in console mode on the project with the script to export as single spreadsheet (including column for Alt/Uniq) | |
- copy the excel/tsv export and extract the alternative translations from it |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment