Created
December 6, 2016 17:32
-
-
Save haruo31/013da99979f8f8e85593af71801dc2a3 to your computer and use it in GitHub Desktop.
sample 2 project
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ java -jar target/sample2-1.0-SNAPSHOT.jar NORMAL ../sample/mecab-ipadic-neologd-master/seed/neologd-adverb-dict-seed.20150623.csv.xz | |
Documents: 139792 | |
Finished: 3.584 s | |
$ java -jar target/sample2-1.0-SNAPSHOT.jar ABNORMAL_COLLECT ../sample/mecab-ipadic-neologd-master/seed/neologd-adverb-dict-seed.20150623.csv.xz | |
Documents: 139792 | |
Finished: 4.527 s | |
$ java -jar target/sample2-1.0-SNAPSHOT.jar NORMAL ../sample/mecab-ipadic-neologd-master/seed/neologd-adjective-exp-dict-seed.20151126.csv.xz | |
Documents: 1051146 | |
Finished: 14.65 s | |
$ java -jar target/sample2-1.0-SNAPSHOT.jar ABNORMAL_COLLECT ../sample/mecab-ipadic-neologd-master/seed/neologd-adjective-exp-dict-seed.20151126.csv.xz | |
Exception in thread "main" java.lang.OutOfMemoryError | |
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) | |
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) | |
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) | |
at java.lang.reflect.Constructor.newInstance(Constructor.java:423) | |
at java.util.concurrent.ForkJoinTask.getThrowableException(ForkJoinTask.java:598) | |
at java.util.concurrent.ForkJoinTask.reportException(ForkJoinTask.java:677) | |
at java.util.concurrent.ForkJoinTask.invoke(ForkJoinTask.java:735) | |
at java.util.stream.ReduceOps$ReduceOp.evaluateParallel(ReduceOps.java:714) | |
at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:233) | |
at java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:510) | |
at jp.underthetree.SampleApp$Engine$2.create(SampleApp.java:63) | |
at jp.underthetree.SampleApp.main(SampleApp.java:24) | |
Caused by: java.lang.OutOfMemoryError: Java heap space | |
at org.apache.lucene.store.RAMFile.newBuffer(RAMFile.java:78) | |
at org.apache.lucene.store.RAMFile.addBuffer(RAMFile.java:51) | |
at org.apache.lucene.store.RAMOutputStream.switchCurrentBuffer(RAMOutputStream.java:164) | |
at org.apache.lucene.store.RAMOutputStream.writeBytes(RAMOutputStream.java:150) | |
at org.apache.lucene.store.DataOutput.copyBytes(DataOutput.java:278) | |
at org.apache.lucene.store.Directory.copyFrom(Directory.java:179) | |
at org.apache.lucene.store.LockValidatingDirectoryWrapper.copyFrom(LockValidatingDirectoryWrapper.java:50) | |
at org.apache.lucene.index.IndexWriter.copySegmentAsIs(IndexWriter.java:2886) | |
at org.apache.lucene.index.IndexWriter.addIndexes(IndexWriter.java:2660) | |
at jp.underthetree.SampleApp$Engine$2.lambda$create$3(SampleApp.java:79) | |
at jp.underthetree.SampleApp$Engine$2$$Lambda$4/1128032093.accept(Unknown Source) | |
at java.util.stream.ReduceOps$4ReducingSink.combine(ReduceOps.java:225) | |
at java.util.stream.ReduceOps$4ReducingSink.combine(ReduceOps.java:211) | |
at java.util.stream.ReduceOps$ReduceTask.onCompletion(ReduceOps.java:754) | |
at java.util.concurrent.CountedCompleter.tryComplete(CountedCompleter.java:577) | |
at java.util.stream.AbstractTask.compute(AbstractTask.java:317) | |
at java.util.concurrent.CountedCompleter.exec(CountedCompleter.java:731) | |
at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289) | |
at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056) | |
at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692) | |
at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:157) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!--
  Maven build for the "sample2" Lucene indexing benchmark.
  Produces a single executable (shaded) jar whose entry point is
  jp.underthetree.Sample2Main.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>jp.underthetree</groupId>
  <artifactId>sample2</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>sample2</name>
  <url>http://maven.apache.org</url>
  <build>
    <!-- Sources live directly under ./src rather than the default src/main/java. -->
    <sourceDirectory>src</sourceDirectory>
    <plugins>
      <plugin>
        <!-- groupId spelled out for consistency with the shade plugin below. -->
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.5.1</version>
        <configuration>
          <!-- The code uses lambdas and streams, so Java 8 is the minimum. -->
          <source>1.8</source>
          <target>1.8</target>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>2.4.3</version>
        <executions>
          <execution>
            <!-- Build the uber-jar as part of "mvn package". -->
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <transformers>
                <!-- Writes Main-Class into the manifest so "java -jar" works. -->
                <transformer
                  implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                  <mainClass>jp.underthetree.Sample2Main</mainClass>
                </transformer>
              </transformers>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
  <dependencies>
    <!-- XZ decompression of the input dictionary files. -->
    <dependency>
      <groupId>org.tukaani</groupId>
      <artifactId>xz</artifactId>
      <version>1.6</version>
    </dependency>
    <!-- IndexWriter, DirectoryReader, RAMDirectory. -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-core</artifactId>
      <version>6.3.0</version>
    </dependency>
    <!-- Stopwatch used for timing the run. -->
    <dependency>
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
      <version>20.0</version>
    </dependency>
  </dependencies>
</project>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package jp.underthetree; | |
import java.io.*; | |
import org.apache.lucene.document.Document; | |
import org.apache.lucene.document.Field.Store; | |
import org.apache.lucene.document.StringField; | |
import org.apache.lucene.index.DirectoryReader; | |
import org.apache.lucene.index.IndexWriter; | |
import org.apache.lucene.index.IndexWriterConfig; | |
import org.apache.lucene.store.Directory; | |
import org.apache.lucene.store.RAMDirectory; | |
import org.tukaani.xz.XZInputStream; | |
import com.google.common.base.Stopwatch; | |
public class Sample2Main { | |
public static void main(String[] args) throws Exception { | |
Engine engine = Engine.valueOf(args[0]); | |
File input = new File(args[1]); | |
Stopwatch sw = Stopwatch.createStarted(); | |
try (InputStream sin = new XZInputStream(new FileInputStream(input))) { | |
DirectoryReader reader = engine.create(sin); | |
System.out.println("Documents: " + reader.maxDoc()); | |
} | |
System.out.println("Finished: " + sw); | |
} | |
static enum Engine { | |
NORMAL { | |
@Override | |
DirectoryReader create(InputStream in) { | |
Directory dir = new RAMDirectory(); | |
try { | |
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig()); | |
new BufferedReader(new InputStreamReader(in)).lines().forEach(s -> { | |
Document doc = new Document(); | |
doc.add(new StringField("row", s, Store.YES)); | |
try { | |
writer.addDocument(doc); | |
} catch (IOException e) { | |
throw new UncheckedIOException(e); | |
} | |
}); | |
writer.commit(); | |
writer.close(); | |
return DirectoryReader.open(dir); | |
} catch (IOException e) { | |
throw new UncheckedIOException(e); | |
} | |
} | |
}, | |
ABNORMAL_COLLECT { | |
@Override | |
DirectoryReader create(InputStream in) { | |
try { | |
IndexWriter w = new BufferedReader(new InputStreamReader(in)).lines().parallel() | |
.unordered().map(s -> { | |
Document doc = new Document(); | |
doc.add(new StringField("row", s, Store.YES)); | |
return doc; | |
}).collect(() -> { | |
try { | |
return new IndexWriter(new RAMDirectory(), new IndexWriterConfig()); | |
} catch (IOException e) { | |
throw new UncheckedIOException(e); | |
} | |
}, (wri, rec) -> { | |
try { | |
wri.addDocument(rec); | |
} catch (IOException e) { | |
throw new UncheckedIOException(e); | |
} | |
}, (wri1, wri2) -> { | |
try { | |
wri2.commit(); | |
wri2.close(); | |
wri1.addIndexes(wri2.getDirectory()); | |
} catch (IOException e) { | |
throw new UncheckedIOException(e); | |
} | |
}); | |
w.commit(); | |
w.close(); | |
return DirectoryReader.open(w.getDirectory()); | |
} catch (IOException e) { | |
throw new UncheckedIOException(e); | |
} | |
} | |
}; | |
abstract DirectoryReader create(InputStream in); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment