Last active
April 9, 2019 13:55
-
-
Save glowinthedark/b2adab05dd68d39330507a9d080b335a to your computer and use it in GitHub Desktop.
PoiWordToTextConverter — minimal POI-based word to text converter (DOC, DOCX)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# build the executable | |
mvn clean compile assembly:single | |
# the jar will be created in the `target` folder and can be run with | |
# java -jar target/wordconv-1.0-SNAPSHOT-jar-with-dependencies.jar text.doc text.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?> | |
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
<modelVersion>4.0.0</modelVersion> | |
<groupId>xh.wordconv</groupId> | |
<artifactId>wordconv</artifactId> | |
<version>1.0-SNAPSHOT</version> | |
<properties> | |
<maven.compiler.source>1.8</maven.compiler.source> | |
<maven.compiler.target>1.8</maven.compiler.target> | |
</properties> | |
<dependencies> | |
<dependency> | |
<groupId>org.apache.poi</groupId> | |
<artifactId>poi</artifactId> | |
<version>3.17</version> | |
</dependency> | |
<dependency> | |
<groupId>org.apache.poi</groupId> | |
<artifactId>poi-ooxml</artifactId> | |
<version>3.17</version> | |
</dependency> | |
<dependency> | |
<groupId>org.apache.poi</groupId> | |
<artifactId>poi-scratchpad</artifactId> | |
<version>3.17</version> | |
</dependency> | |
</dependencies> | |
<build> | |
<sourceDirectory>src</sourceDirectory> | |
<plugins> | |
<!-- Packages the compiled classes together with all dependencies into one runnable jar. -->
<plugin>
  <artifactId>maven-assembly-plugin</artifactId>
  <configuration>
    <!-- The Java source/target level is set by the maven.compiler.* properties of this POM;
         source/target are compiler settings and do not belong in this plugin's configuration. -->
    <archive>
      <manifest>
        <!-- Entry point written to the jar manifest so the jar can be started with java -jar. -->
        <mainClass>WordToTextConverter</mainClass>
      </manifest>
    </archive>
    <descriptorRefs>
      <descriptorRef>jar-with-dependencies</descriptorRef>
    </descriptorRefs>
  </configuration>
</plugin>
</plugins> | |
</build> | |
</project> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// should go into the `src` folder relative to the `pom.xml` file | |
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.xmlbeans.XmlException;
public class WordToTextConverter { | |
public static void main(String[] args) { | |
try { | |
convertWordToText(args[0], args[1]); | |
} catch (ArrayIndexOutOfBoundsException aiobe) { | |
System.out.println("Usage: java WordToTextConverter <word_file> <text_file>"); | |
} | |
} | |
public static void convertWordToText(String src, String desc) { | |
try { | |
FileInputStream fs = new FileInputStream(src); | |
final POITextExtractor extractor = ExtractorFactory.createExtractor(fs); | |
FileWriter fw = new FileWriter(desc); | |
fw.write(extractor.getText()); | |
fw.flush(); | |
fs.close(); | |
fw.close(); | |
} catch (IOException | OpenXML4JException | XmlException e) { | |
e.printStackTrace(); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment