Skip to content

Instantly share code, notes, and snippets.

@haruo31
Created December 6, 2016 17:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save haruo31/013da99979f8f8e85593af71801dc2a3 to your computer and use it in GitHub Desktop.
Save haruo31/013da99979f8f8e85593af71801dc2a3 to your computer and use it in GitHub Desktop.
sample 2 project
$ java -jar target/sample2-1.0-SNAPSHOT.jar NORMAL ../sample/mecab-ipadic-neologd-master/seed/neologd-adverb-dict-seed.20150623.csv.xz
Documents: 139792
Finished: 3.584 s
$ java -jar target/sample2-1.0-SNAPSHOT.jar ABNORMAL_COLLECT ../sample/mecab-ipadic-neologd-master/seed/neologd-adverb-dict-seed.20150623.csv.xz
Documents: 139792
Finished: 4.527 s
$ java -jar target/sample2-1.0-SNAPSHOT.jar NORMAL ../sample/mecab-ipadic-neologd-master/seed/neologd-adjective-exp-dict-seed.20151126.csv.xz
Documents: 1051146
Finished: 14.65 s
$ java -jar target/sample2-1.0-SNAPSHOT.jar ABNORMAL_COLLECT ../sample/mecab-ipadic-neologd-master/seed/neologd-adjective-exp-dict-seed.20151126.csv.xz
Exception in thread "main" java.lang.OutOfMemoryError
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at java.util.concurrent.ForkJoinTask.getThrowableException(ForkJoinTask.java:598)
at java.util.concurrent.ForkJoinTask.reportException(ForkJoinTask.java:677)
at java.util.concurrent.ForkJoinTask.invoke(ForkJoinTask.java:735)
at java.util.stream.ReduceOps$ReduceOp.evaluateParallel(ReduceOps.java:714)
at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:233)
at java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:510)
at jp.underthetree.SampleApp$Engine$2.create(SampleApp.java:63)
at jp.underthetree.SampleApp.main(SampleApp.java:24)
Caused by: java.lang.OutOfMemoryError: Java heap space
at org.apache.lucene.store.RAMFile.newBuffer(RAMFile.java:78)
at org.apache.lucene.store.RAMFile.addBuffer(RAMFile.java:51)
at org.apache.lucene.store.RAMOutputStream.switchCurrentBuffer(RAMOutputStream.java:164)
at org.apache.lucene.store.RAMOutputStream.writeBytes(RAMOutputStream.java:150)
at org.apache.lucene.store.DataOutput.copyBytes(DataOutput.java:278)
at org.apache.lucene.store.Directory.copyFrom(Directory.java:179)
at org.apache.lucene.store.LockValidatingDirectoryWrapper.copyFrom(LockValidatingDirectoryWrapper.java:50)
at org.apache.lucene.index.IndexWriter.copySegmentAsIs(IndexWriter.java:2886)
at org.apache.lucene.index.IndexWriter.addIndexes(IndexWriter.java:2660)
at jp.underthetree.SampleApp$Engine$2.lambda$create$3(SampleApp.java:79)
at jp.underthetree.SampleApp$Engine$2$$Lambda$4/1128032093.accept(Unknown Source)
at java.util.stream.ReduceOps$4ReducingSink.combine(ReduceOps.java:225)
at java.util.stream.ReduceOps$4ReducingSink.combine(ReduceOps.java:211)
at java.util.stream.ReduceOps$ReduceTask.onCompletion(ReduceOps.java:754)
at java.util.concurrent.CountedCompleter.tryComplete(CountedCompleter.java:577)
at java.util.stream.AbstractTask.compute(AbstractTask.java:317)
at java.util.concurrent.CountedCompleter.exec(CountedCompleter.java:731)
at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056)
at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692)
at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:157)
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Build descriptor for the sample2 benchmark: produces a single runnable jar. -->
<modelVersion>4.0.0</modelVersion>
<groupId>jp.underthetree</groupId>
<artifactId>sample2</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<name>sample2</name>
<url>http://maven.apache.org</url>
<build>
<!-- Sources live directly under src/ instead of Maven's default src/main/java. -->
<sourceDirectory>src</sourceDirectory>
<plugins>
<plugin>
<!-- Compile for Java 8: the application code uses lambdas and java.util.stream. -->
<artifactId>maven-compiler-plugin</artifactId>
<version>3.5.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<!-- Runs the shade goal during the package phase so the jar can be started with java -jar. -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<!-- Records the entry point class in the jar manifest. -->
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>jp.underthetree.Sample2Main</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- XZ decompression; the application imports org.tukaani.xz.XZInputStream. -->
<dependency>
<groupId>org.tukaani</groupId>
<artifactId>xz</artifactId>
<version>1.6</version>
</dependency>
<!-- Lucene core; the application imports IndexWriter, DirectoryReader and RAMDirectory. -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>6.3.0</version>
</dependency>
<!-- Guava; the application imports com.google.common.base.Stopwatch for timing. -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>20.0</version>
</dependency>
</dependencies>
</project>
package jp.underthetree;
import java.io.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.tukaani.xz.XZInputStream;
import com.google.common.base.Stopwatch;
/**
 * Small benchmark that builds an in-memory Lucene index from an XZ-compressed
 * text file, one document per input line, and prints the document count and
 * the elapsed time.
 *
 * <p>Usage: {@code Sample2Main <NORMAL|ABNORMAL_COLLECT> <input.xz>}
 */
public class Sample2Main {

    /**
     * Program entry point.
     *
     * @param args {@code args[0]} is the name of an {@link Engine} constant,
     *             {@code args[1]} is the path of the XZ-compressed input file
     * @throws IllegalArgumentException if fewer than two arguments are given or
     *                                  the engine name is not a known constant
     * @throws Exception                if the input cannot be read or indexed
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            throw new IllegalArgumentException(
                    "usage: Sample2Main <NORMAL|ABNORMAL_COLLECT> <input.xz>");
        }
        Engine engine = Engine.valueOf(args[0]);
        File input = new File(args[1]);
        Stopwatch sw = Stopwatch.createStarted();
        // Each resource is declared separately so the file handle is released
        // even when the XZ header is invalid, and the reader is always closed.
        try (InputStream raw = new FileInputStream(input);
                InputStream sin = new XZInputStream(raw);
                DirectoryReader reader = engine.create(sin)) {
            System.out.println("Documents: " + reader.maxDoc());
        }
        System.out.println("Finished: " + sw);
    }

    /**
     * Wraps the stream in a line reader that decodes UTF-8 explicitly rather
     * than relying on the platform default charset. The returned reader is
     * deliberately not closed by the engines: the caller owns {@code in}.
     */
    private static BufferedReader lineReader(InputStream in) {
        return new BufferedReader(
                new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8));
    }

    /** Builds the single-field document stored for one input line. */
    private static Document toDocument(String row) {
        Document doc = new Document();
        doc.add(new StringField("row", row, Store.YES));
        return doc;
    }

    /** Adds a document, converting the checked exception for use in lambdas. */
    private static void indexDocument(IndexWriter writer, Document doc) {
        try {
            writer.addDocument(doc);
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    /** Creates a writer backed by its own fresh in-memory directory. */
    private static IndexWriter newRamWriter() {
        try {
            return new IndexWriter(new RAMDirectory(), new IndexWriterConfig());
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    /**
     * Commits and closes {@code source}, copies its index into {@code target},
     * then closes the source directory so the partial index is released as soon
     * as it has been copied.
     */
    private static void mergeInto(IndexWriter target, IndexWriter source) {
        try (Directory sourceDir = source.getDirectory()) {
            // The source writer is closed before addIndexes, as in the
            // original statement order.
            try (IndexWriter closing = source) {
                closing.commit();
            }
            target.addIndexes(sourceDir);
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    /** Strategies for turning the lines of a stream into a Lucene index. */
    enum Engine {
        /** Sequential indexing: one writer, one directory, lines added in order. */
        NORMAL {
            @Override
            DirectoryReader create(InputStream in) {
                Directory dir = new RAMDirectory();
                try {
                    try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
                        lineReader(in).lines()
                                .forEach(row -> indexDocument(writer, toDocument(row)));
                        writer.commit();
                    }
                    // Opened only after the writer has been committed and closed.
                    return DirectoryReader.open(dir);
                } catch (IOException e) {
                    throw new UncheckedIOException(e);
                }
            }
        },
        /**
         * Parallel indexing through a mutable reduction: every partial result is
         * its own writer over its own in-memory directory, and partial indexes
         * are merged with {@code addIndexes}. The transcript accompanying this
         * sample shows this variant failing with an {@code OutOfMemoryError}
         * inside {@code addIndexes} on the larger input.
         */
        ABNORMAL_COLLECT {
            @Override
            DirectoryReader create(InputStream in) {
                IndexWriter merged = lineReader(in).lines().parallel().unordered()
                        .map(Sample2Main::toDocument)
                        .collect(Sample2Main::newRamWriter,
                                Sample2Main::indexDocument,
                                Sample2Main::mergeInto);
                try {
                    Directory dir = merged.getDirectory();
                    try (IndexWriter writer = merged) {
                        writer.commit();
                    }
                    return DirectoryReader.open(dir);
                } catch (IOException e) {
                    throw new UncheckedIOException(e);
                }
            }
        };

        /**
         * Indexes every line of {@code in} as one document.
         *
         * @param in source of text lines; not closed by this method
         * @return a reader over the finished index; the caller must close it
         * @throws UncheckedIOException if reading or indexing fails
         */
        abstract DirectoryReader create(InputStream in);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment