Skip to content

Instantly share code, notes, and snippets.

@yu-tang
Last active January 2, 2017 22:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yu-tang/6526991 to your computer and use it in GitHub Desktop.
Save yu-tang/6526991 to your computer and use it in GitHub Desktop.
OmegaT 用スクリプト。全分節の全参考訳文を列挙するサンプル。
// demonstrate how to get matches for each entry
// original concept from
// https://libretraduko.wordpress.com/2013/09/10/export-relavant-tus-from-legacy-tmx-files-in-omegat/
// by Kos Ivantsov 2013-09-10
// +
// Some hacks and refactoring
// https://gist.github.com/yu-tang/6526991
// by Yu-Tang
// +
// GUI independence and many improvements taken from
// http://pastebin.com/bukEe6Sb
// by cienislaw 2013-09-19
// +
// Add multithreading
// by Yu-Tang 2013-09-21
// +
// File selector taken from
// https://libretraduko.wordpress.com/2013/09/10/export-relavant-tus-from-legacy-tmx-files-in-omegat/
// by Kos Ivantsov + mod by Yu-Tang 2013-09-22
// ===================================================================
// Note: Currently this script does not handle multiple translations.
// If the project has segments with alternative translations,
// the outcome is not guaranteed.
//################## config start ##################
SELECT_FILES = 'no' // 'yes' to pick the file(s) to export via a file chooser; any other value exports the whole project.
MIN_SIMILARITY = 50 // minimum match similarity (0-100) for a fuzzy match to be exported. 0 means everything.
SIMILARITY_TYPE = 'adjustedScore' // which score MIN_SIMILARITY is compared against ( score / scoreNoStem / adjustedScore )
//################## config end ##################
import groovy.swing.SwingBuilder
import groovy.time.TimeCategory
import groovy.time.TimeDuration
import groovy.transform.Synchronized
import org.omegat.core.Core
import org.omegat.core.data.IProject
import org.omegat.core.data.PrepareTMXEntry
import org.omegat.core.data.ProjectProperties
import org.omegat.core.data.SourceTextEntry
import org.omegat.core.data.TMXEntry
import org.omegat.core.matching.NearString
import org.omegat.gui.matches.FindMatchesThread
import org.omegat.gui.scripting.IScriptLogger
import org.omegat.util.TMXWriter2
import javax.swing.*
import javax.swing.filechooser.FileFilter
import javax.swing.filechooser.FileSystemView
import java.security.MessageDigest
import java.util.concurrent.atomic.AtomicInteger
/**
 * Worker thread that exports a single source entry.
 *
 * <p>A translated entry is handed to the {@link Merger} as is; for an
 * untranslated entry the result of {@code search()} (presumably inherited
 * from FindMatchesThread -- verify against the OmegaT version in use) is
 * handed over instead. When the work is done, the thread asks the merger
 * to start a worker for the next entry and reports its own completion.
 */
class MatchesExportThread extends FindMatchesThread {
    private Merger merger
    private SourceTextEntry entry
    private IProject project

    /**
     * @param merger  coordinator that collects results and spawns further workers
     * @param project project the entry belongs to
     * @param entry   source entry to process
     */
    public MatchesExportThread(final Merger merger, final IProject project, final SourceTextEntry entry) {
        super(null, project, entry);
        this.merger = merger
        this.entry = entry
        this.project = project
        // super class gets null as matcherPane argument.
        // So we need to override all methods which refer to matcherPane.
    }

    /**
     * Always reports "not changed": there is no matcher pane whose current
     * entry could be compared against.
     */
    @Override
    protected boolean isEntryChanged() {
        return false
    }

    /**
     * Processes the entry, then always hands control back to the merger.
     *
     * <p>The hand-over lives in a {@code finally} block: if it were skipped
     * after a failure, the merger's runner count would never reach zero, so
     * the TMX writer would stay open and autosave would stay disabled.
     */
    @Override
    public void run() {
        try {
            export()
        } catch (Exception ex) {
            // report any failure (search, lookup or write) instead of dying silently
            merger.setError ex, entry
        } finally {
            try {
                merger.createThread()
            } finally {
                merger.onThreadComplete()
            }
        }
    }

    /**
     * Exports this entry. Exceptions are propagated to {@link #run()},
     * which reports them to the merger.
     */
    private void export() {
        // A source text flagged as duplicated is exported only once.
        if (entry.duplicate != SourceTextEntry.DUPLICATE.NONE) {
            if (! merger.addProcessedSource(entry.srcText)) {
                return // skip already processed source
            }
        }

        // Already translated: write the translation itself.
        TMXEntry info = project.getTranslationInfo(entry)
        if (info.isTranslated()) {
            merger.writeEntry info
            return
        }

        // Untranslated: export whatever the match search returns.
        merger.setFoundResult search()
    }
}
/**
 * Custom TMXWriter class
 *
 * <ul><li>TU countable.
 * <li>Unique by source and translation pair. Duplicate TUs are not written.
 * </ul>
 *
 * <p>Uniqueness is tracked through a SHA-256 digest of the UTF-8 encoded
 * source/translation pair, so the check does not depend on the platform
 * default charset. This class does no locking of its own; callers that use
 * it from several threads have to serialise access themselves.
 */
class HashedTMXWriter extends TMXWriter2 {
    private MessageDigest md = MessageDigest.getInstance('SHA-256')
    // digests of every source/translation pair written so far
    private HashSet<ByteArrayWrapper> set = new HashSet<ByteArrayWrapper>()
    private int similarity
    private String similarityType
    private int fuzzyTUCount = 0

    /**
     * @param file           destination TMX file
     * @param props          ProjectProperties supplying the languages and the segmenting flag
     * @param similarity     minimum score a match needs to be written
     * @param similarityType name of the score field compared against {@code similarity}
     */
    public HashedTMXWriter(File file, ProjectProperties props, int similarity, String similarityType) {
        super(file,
              props.sourceLanguage,
              props.targetLanguage,
              props.isSentenceSegmentingEnabled(),
              true, // levelTwo
              true) // forceValidTMX
        this.similarity = similarity
        this.similarityType = similarityType
    }

    /**
     * Write one entry unless the same source/translation pair was written before.
     *
     * @param entry   TMXEntry
     * @param isFuzzy true when the entry comes from a fuzzy match; it is then
     *                counted in {@link #getFuzzyTUCount()}
     */
    public void writeEntry(TMXEntry entry, boolean isFuzzy = false) {
        ByteArrayWrapper wrap = getByteArrayWrapper(entry)
        if (set.add(wrap)) {
            writeEntry entry.source, entry.translation, entry, null
            if (isFuzzy) {
                fuzzyTUCount++
            }
        }
    }

    /**
     * Write one entry built from a match, provided its score reaches the
     * configured minimum similarity.
     *
     * @param match NearString
     */
    public void writeEntry(NearString match) {
        // filter by similarity
        if (match.scores[0][similarityType] < similarity) {
            return
        }
        TMXEntry entry = new TMXEntry(
            new PrepareTMXEntry(
                source: match.source,
                translation: match.translation,
                changer: match.changer,
                changeDate: match.changedDate,
                creator: match.creator,
                creationDate: match.creationDate,
                note: null,
                otherProperties: match.props
            ),
            true, // defaultTranslation
            null) // ExternalLinked
        writeEntry entry, true
    }

    /** @return number of distinct TUs written so far */
    public int getTUCount() {
        set.size()
    }

    /** @return number of written TUs that did not come from fuzzy matches */
    public int getTranslatedTUCount() {
        getTUCount() - getFuzzyTUCount()
    }

    /** @return number of written TUs that came from fuzzy matches */
    public int getFuzzyTUCount() {
        fuzzyTUCount
    }

    /** @return SHA-256 digest of the UTF-8 bytes of {@code message} */
    private byte[] getHash(String message) {
        md.digest message.getBytes('UTF-8')
    }

    /**
     * Builds the uniqueness key of an entry.
     *
     * <p>The key is derived from UTF-8 bytes rather than from the platform
     * default charset: with a non-Unicode default charset unmappable
     * characters would all be replaced by the same substitute, and different
     * texts could then be mistaken for duplicates and dropped.
     */
    private ByteArrayWrapper getByteArrayWrapper(TMXEntry entry) {
        // '\0' separates the two parts so that ("ab", "c") and ("a", "bc") differ
        new ByteArrayWrapper(getHash(entry.source + '\0' + entry.translation))
    }

    // inner class - byte array wrapper giving arrays value-based equality
    private final class ByteArrayWrapper {
        private final byte[] data

        public ByteArrayWrapper(byte[] data) {
            // explicit null test: Groovy truth would also reject an empty array
            if (data == null) {
                throw new NullPointerException()
            }
            this.data = data
        }

        @Override
        public boolean equals(Object other) {
            if (! (other instanceof ByteArrayWrapper)) {
                return false
            }
            Arrays.equals data, ((ByteArrayWrapper)other).data
        }

        @Override
        public int hashCode() {
            return Arrays.hashCode(data)
        }
    }
}
/**
 * Collects matches, filters them and writes them to the TMX file.
 *
 * <p>The merger keeps up to {@code maxOfThreads} {@link MatchesExportThread}
 * workers busy: every worker asks for a successor through
 * {@link #createThread()} before it reports completion through
 * {@link #onThreadComplete()}. When the last outstanding runner completes,
 * the writer is closed, the summary is printed and autosave is restored.
 *
 * <p>Note that {@link #start()} is overridden without delegating to
 * {@code Thread.start()}, so the set-up runs on the calling thread.
 */
class Merger extends Thread {
    private List<SourceTextEntry> entries
    private File fileTMX
    private HashedTMXWriter writer
    private int maxOfThreads
    private IProject project
    private IScriptLogger console
    private int sizeOfEntries
    // index of the next entry to hand to a worker
    private final AtomicInteger currentEntry = new AtomicInteger()
    // outstanding runner tokens: live workers plus the spawning phase of start()
    private final AtomicInteger runners = new AtomicInteger()
    // number of entries whose processing failed; atomic because workers report concurrently
    private final AtomicInteger failed = new AtomicInteger()
    private final Date timeStart = new Date()
    private final Set<String> processedSources = Collections.synchronizedSet(new HashSet<String>())

    /**
     * Disables autosave and spawns the initial batch of workers.
     *
     * <p>The caller holds a runner token of its own while spawning.
     * Otherwise a fast worker could bring the runner count to zero, and
     * thereby close the writer, while this method is still about to start
     * the worker for the next entry.
     */
    @Override
    public synchronized void start() {
        // get rid of autosave during processing
        Core.autoSave.disable()
        sizeOfEntries = entries.size()
        runners.incrementAndGet()
        try {
            // create matches search threads
            int cnt = maxOfThreads
            while (cnt-- > 0 && createThread()) {
                // all the work happens in the loop condition
            }
        } finally {
            onThreadComplete()
        }
    }

    /**
     * Counts a failed entry and prints the error to the console.
     *
     * @param error the failure
     * @param entry the entry that was being processed
     */
    public void setError(final Exception error, final SourceTextEntry entry) {
        failed.incrementAndGet()
        new SwingBuilder().doLater() {
            console.println "Error on seg. #${entry.entryNum()}: ${entry.srcText}"
            console.println error
        }
    }

    /**
     * Write TUs from matches.
     *
     * @param matches matches found for one entry
     */
    @Synchronized("writer")
    public void setFoundResult(final List<NearString> matches) {
        matches.each writer.&writeEntry
    }

    /**
     * Write one TU from translated segment.
     *
     * @param entry TMXEntry
     */
    @Synchronized("writer")
    public void writeEntry(TMXEntry entry) {
        writer.writeEntry entry
    }

    /**
     * create new MatchesExportThread thread
     *
     * <p>The runner count is raised <em>before</em> the worker is started.
     * Otherwise the worker could finish, and decrement the count, ahead of
     * its own increment, making the count hit zero too early.
     *
     * @return Returns true if new thread was created and false otherwise.
     */
    public boolean createThread() {
        int index = currentEntry.getAndIncrement()
        boolean createMore = index < sizeOfEntries
        if (createMore) {
            runners.incrementAndGet()
            try {
                new MatchesExportThread(this, project, entries[index]).start()
            } catch (Throwable t) {
                // the worker never ran, so give its token back
                runners.decrementAndGet()
                throw t
            }
        }
        createMore
    }

    /**
     * Adds the source string to HashSet if it is not already present.
     *
     * @param source String
     * @return true if this set did not already contain the specified element
     */
    public boolean addProcessedSource(String source) {
        processedSources.add(source)
    }

    /* called when each thread completed */
    public void onThreadComplete() {
        if (runners.decrementAndGet() == 0) {
            onExit()
        }
    }

    /* called when all threads completed */
    private void onExit() {
        try {
            writer.close()
            // delete TMX file when it has no TU
            if (writer.TUCount == 0) {
                def folder = fileTMX.parentFile
                fileTMX.delete()
                // list() returns null when the folder cannot be read
                if (folder.list()?.size() == 0) {
                    folder.delete() // delete empty folder too
                }
            }
            // output summary
            consolePrintln getSummary()
        } finally {
            // restore AutoSave, even when closing or clean-up failed
            Core.autoSave.enable()
        }
    }

    /* prints every argument on its own console line, later on the Swing thread */
    private void consolePrintln(Object... args) {
        new SwingBuilder().doLater() {args.each console.&println}
    }

    /* builds the two-line summary: TU counts (plus failures, if any) and elapsed time */
    private String getSummary() {
        TimeDuration td = TimeCategory.minus( new Date(), this.timeStart )
        int failures = failed.get()
        "Exported ${writer.TUCount} TUs (translated ${writer.translatedTUCount} + fuzzy ${writer.fuzzyTUCount})." +
            (failures ? ' failed ' + failures + '.' : '') + "\n" +
            "It took ${td.hours ? td.hours + ' h ' : ''}${td.minutes} min ${td.seconds} sec ${td.millis} ms"
    }
}
/**
 * FileSystemView that presents one given directory as the only root
 * and as the home directory, and refuses to create anything.
 */
public class DirectoryRestrictedFileSystemView extends FileSystemView {
    private File rootDirectory

    /** The configured directory is the one and only root. */
    @Override
    public File[] getRoots() {
        File[] onlyRoot = new File[1]
        onlyRoot[0] = rootDirectory
        onlyRoot
    }

    /** A file is a root exactly when it equals the configured directory. */
    @Override
    public boolean isRoot(File f) {
        rootDirectory.equals(f)
    }

    /** Home is the configured directory as well. */
    @Override
    public File getHomeDirectory() {
        rootDirectory
    }

    /** Parent lookup is left to the default implementation. */
    @Override
    public File getParentDirectory(File dir) {
        super.getParentDirectory(dir)
    }

    /** Creating file system roots is not supported by this view. */
    @Override
    protected File createFileSystemRoot(File f) {
        throw new UnsupportedOperationException("Not supported yet.")
    }

    /** Creating folders is not supported by this view. */
    @Override
    public File createNewFolder(File containingDir) throws IOException {
        throw new UnsupportedOperationException("Not supported yet.")
    }
}
/**
 * FileFilter that lets through directories and the files named on a white list.
 */
public class WhiteListFilter extends FileFilter {
    private List<File> whiteList

    /**
     * @param f candidate file
     * @return true for any directory (so that it can be browsed into) and
     *         for files contained in the white list
     */
    public boolean accept(File f) {
        if (f.isDirectory()) {
            return true
        }
        return whiteList.contains(f)
    }

    /** @return the label describing this filter */
    public String getDescription() {
        'OmegaT Source files'
    }
}
/**
 * Returns the export target, 'tmx_export/exported_relevant.tmx' below the
 * project root, creating the 'tmx_export' folder first when it is missing.
 */
File getDestTmxFile() {
    File exportDir = new File(project.projectProperties.projectRoot, 'tmx_export')
    if (! exportDir.exists()) {
        exportDir.mkdir()
    }
    return new File(exportDir, 'exported_relevant.tmx')
}
/**
 * Lets the user choose source files in a file chooser rooted at the
 * project's source folder and collects the entries of the chosen files.
 *
 * <p>A chosen file is matched to a project file through its path relative
 * to the source folder. Path separators are normalised on both sides, and a
 * file that matches no project file is reported and skipped.
 *
 * @return entries of all chosen files; empty when the dialog was cancelled
 */
List<SourceTextEntry> getSelectedFilesEentries() {
    String sourceRoot = project.projectProperties.sourceRoot
    File rootDir = new File(sourceRoot)
    List<SourceTextEntry> entries = new ArrayList<SourceTextEntry>()
    def projectFiles = project.projectFiles

    JFileChooser fc = new JFileChooser(
        rootDir,
        new DirectoryRestrictedFileSystemView(rootDirectory: rootDir))
    fc.acceptAllFileFilterUsed = false
    fc.addChoosableFileFilter new WhiteListFilter(
        whiteList: projectFiles.collect() { new File(rootDir, it.filePath) })
    fc.dialogTitle = 'Choose files to export'
    fc.fileSelectionMode = JFileChooser.FILES_ONLY
    fc.multiSelectionEnabled = true
    if (fc.showOpenDialog(mainWindow.applicationFrame) != JFileChooser.APPROVE_OPTION) {
        console.println 'Canceled'
        return entries
    }

    def files = fc.selectedFiles
    console.println "Choosed ${files.size()} file(s)."

    // Relativize canonical paths instead of slicing strings by the length of
    // sourceRoot: the configured root need not be in canonical form.
    def rootPath = rootDir.canonicalFile.toPath()
    files.each() { file ->
        String relativePath = rootPath.relativize(file.canonicalFile.toPath())
                                      .toString().replace('\\', '/')
        def fi = projectFiles.find() {
            it.filePath.replace('\\', '/') == relativePath }
        if (fi == null) {
            // no matching project file: skip it rather than fail on fi.entries
            console.println "Skipped (not a project source file): ${file}"
        } else {
            entries.addAll fi.entries
        }
    }
    entries
}
//===========================================
// main flow
//===========================================
// Nothing can be exported without a loaded project.
if (! project.isProjectLoaded()) {
    console.println "no project found."
    return
}

// Entries come either from the files picked by the user or from the whole project.
def entries
if (SELECT_FILES == 'yes') {
    entries = getSelectedFilesEentries()
} else {
    entries = project.allEntries
}

// abort when there is nothing to process
if (entries.size() == 0) {
    console.println "no entry found."
    return
}

// Prepare the destination file and its writer.
File tmx = getDestTmxFile()
HashedTMXWriter writer = new HashedTMXWriter(
    tmx, project.projectProperties, MIN_SIMILARITY, SIMILARITY_TYPE)
writer.writeComment " Default translations "

// One worker per available processor.
int processors = Runtime.runtime.availableProcessors()
def merger = new Merger(
    entries: entries,
    fileTMX: tmx,
    writer: writer,
    maxOfThreads: processors,
    project: project,
    console: console)
merger.start()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment