Skip to content

Instantly share code, notes, and snippets.

@kishida
Last active April 8, 2023 02:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kishida/0899fed6c7b504f767ad2020f896ea40 to your computer and use it in GitHub Desktop.
Save kishida/0899fed6c7b504f767ad2020f896ea40 to your computer and use it in GitHub Desktop.
圧縮きしだのHatena
package neoki.slm;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.function.Consumer;
import java.util.regex.Pattern;
/**
 * Line-oriented reader for a blog export file.
 *
 * <p>The parser expects each entry to consist of {@code KEY: value} header lines,
 * a {@code -----} separator, one marker line that is skipped, the body lines,
 * another {@code -----} separator, an optional trailing section that is ignored,
 * and a closing {@code --------} line. (Presumably this is the Movable Type style
 * export produced by Hatena Blog; confirm against a real export file.)
 */
public class HatenaReader {

    /** Matches an opening or closing tag whose name starts with a lowercase letter. */
    private static final Pattern TAG = Pattern.compile("<[a-z/][^>]*>");

    /** Mutable accumulator for the header fields of the entry currently being parsed. */
    static class Header {
        String baseName;
        String image;
        String title;
        String date;
        boolean published;
    }

    /**
     * One parsed blog entry.
     *
     * @param title       value of the {@code TITLE} header
     * @param baseName    value of the {@code BASENAME} header
     * @param image       value of the {@code IMAGE} header, or {@code null} when the header is absent
     * @param date        value of the {@code DATE} header, kept as the raw string
     * @param published   {@code true} when the header contained the line {@code STATUS: Publish}
     * @param body        body lines exactly as read, each terminated by {@code '\n'}
     * @param stripedBody body lines with tags removed, each terminated by {@code '\n'}
     */
    public record BlogEntry(
            String title, String baseName, String image, String date, boolean published,
            String body, String stripedBody) { }

    /**
     * Parses the export file at {@code path} and hands every complete entry to
     * {@code entryConsumer} in file order.
     *
     * <p>An entry is emitted only when its closing {@code --------} line is seen.
     * If an entry reaches its trailing section without a {@code BASENAME},
     * {@code DATE} and {@code TITLE}, parsing stops silently and the rest of the
     * file is not read.
     *
     * @param path          file to read (decoded as UTF-8, the default of
     *                      {@link Files#newBufferedReader(Path)})
     * @param entryConsumer callback invoked once per complete entry
     * @throws IOException if the file cannot be opened or read
     */
    static void read(Path path, Consumer<BlogEntry> entryConsumer) throws IOException {
        enum Part {HEADER, CONTENT, BODY, COMMENT}
        try (var reader = Files.newBufferedReader(path)) {
            Part p = Part.HEADER;
            StringBuilder body = new StringBuilder();
            StringBuilder striped = new StringBuilder();
            Header h = new Header();
            for (String line; (line = reader.readLine()) != null; ) {
                switch (p) {
                    case HEADER -> {
                        if (line.equals("-----")) {
                            p = Part.CONTENT;
                        } else if (line.equals("STATUS: Publish")) {
                            h.published = true;
                        } else {
                            // Split on the first colon so that a header without a value
                            // ("IMAGE:") yields an empty string instead of running off
                            // the end of the line, and so that a longer key sharing a
                            // prefix is not mistaken for a known one.
                            int colon = line.indexOf(':');
                            if (colon > 0) {
                                String value = line.substring(colon + 1);
                                if (value.startsWith(" ")) {
                                    value = value.substring(1);
                                }
                                switch (line.substring(0, colon)) {
                                    case "BASENAME" -> h.baseName = value;
                                    case "IMAGE" -> h.image = value;
                                    case "TITLE" -> h.title = value;
                                    case "DATE" -> h.date = value;
                                    default -> { }
                                }
                            }
                        }
                    }
                    case CONTENT -> {
                        // The line following the header separator is consumed without
                        // being inspected; the body starts on the next line.
                        p = Part.BODY;
                        body.setLength(0);
                        striped.setLength(0);
                    }
                    case BODY -> {
                        if (line.equals("-----")) {
                            p = Part.COMMENT;
                        } else {
                            striped.append(TAG.matcher(line).replaceAll("")).append("\n");
                            body.append(line).append("\n");
                        }
                    }
                    case COMMENT -> {
                        if (h.baseName == null || h.date == null || h.title == null) {
                            // Incomplete header: give up on the whole file without reporting.
                            return;
                        }
                        if (line.equals("--------")) {
                            entryConsumer.accept(new BlogEntry(
                                    h.title, h.baseName, h.image, h.date, h.published,
                                    body.toString(), striped.toString()));
                            p = Part.HEADER;
                            h = new Header();
                        }
                    }
                }
            }
        }
    }
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor: a single library dependency plus Java 20 compiler settings. -->
<!-- NOTE(review): groupId is "naoki" while the Java sources declare package "neoki.slm"; confirm which spelling is intended. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>naoki</groupId>
<artifactId>SmallLanguageModel</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- Kuromoji with the IPADIC dictionary; TokenReader imports com.atilika.kuromoji.ipadic.Tokenizer and Token from it. -->
<dependency>
<groupId>com.atilika.kuromoji</groupId>
<artifactId>kuromoji-ipadic</artifactId>
<version>0.9.0</version>
</dependency>
</dependencies>
<properties>
<!-- Compile for Java 20; the sources use records, local enums and arrow-form switch, which need Java 16 or later. -->
<maven.compiler.source>20</maven.compiler.source>
<maven.compiler.target>20</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
</project>
package neoki.slm;
import com.atilika.kuromoji.TokenBase;
import com.atilika.kuromoji.ipadic.Token;
import com.atilika.kuromoji.ipadic.Tokenizer;
import javax.swing.*;
import java.awt.*;
import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
/**
 * Builds a first-order successor table over the tokens of the blog entries read
 * by {@code HatenaReader} and shows a small Swing window that generates text by
 * randomly walking that table.
 */
public class TokenReader {

    /** A distinct token surface together with the counts of tokens observed directly after it. */
    static class Morph {
        final String surface;
        final TokenBase token;
        final Map<Morph, Integer> successor = new HashMap<>();
        int successorCount = 0;

        Morph(String surface, TokenBase token) {
            this.surface = surface;
            this.token = token;
        }

        /** Records one more observation of {@code m} directly following this morph. */
        void addSuccessor(Morph m) {
            successor.merge(m, 1, Integer::sum);
            ++successorCount;
        }
    }

    /** Every morph seen so far, keyed by its surface string. */
    static Map<String, Morph> morphs = new HashMap<>();

    /** Sentinel that stands for both the start and the end of a sentence. */
    static final Morph START = new Morph(null, null);

    /** Export file read when no path is given on the command line. */
    private static final String DEFAULT_EXPORT_PATH = "D:/dev/nowokay.hatenablog.com.export.txt";

    /** Upper bound on the number of tokens appended per click of the generate button. */
    private static final int MAX_TOKENS = 100;

    /**
     * Learns the successor table from the export file, then opens the generator window.
     *
     * @param args optional; {@code args[0]} overrides the default export file path
     * @throws IOException if the export file cannot be read
     */
    public static void main(String[] args) throws IOException {
        System.out.println("Hello, World!");
        var tokenizer = new Tokenizer();
        var exportPath = Path.of(args.length > 0 ? args[0] : DEFAULT_EXPORT_PATH);
        HatenaReader.read(exportPath, entry -> {
            System.out.println("--\n" + entry.title());
            entry.stripedBody().lines().forEach(line -> learnLine(tokenizer, line));
        });
        // Swing components must be created and shown on the event dispatch thread.
        SwingUtilities.invokeLater(TokenReader::showWindow);
    }

    /** Tokenizes one body line and adds its token-to-token transitions to the table. */
    private static void learnLine(Tokenizer tokenizer, String rawLine) {
        String line = rawLine.trim();
        // Skip blank lines and lines whose first character is ASCII.
        if (line.isEmpty() || line.charAt(0) < 0x80) {
            return;
        }
        Morph current = START;
        List<Token> tokens = tokenizer.tokenize(line);
        for (Token token : tokens) {
            Morph m = morphs.computeIfAbsent(token.getSurface(), s -> new Morph(s, token));
            current.addSuccessor(m);
            if (m.token.getAllFeatures().contains("句点")) {
                // A token tagged "句点" closes the sentence; the next token starts a new one.
                m.addSuccessor(START);
                current = START;
            } else {
                current = m;
            }
        }
        if (current != START) {
            // The line ended without a closing token: treat the line end as a sentence end.
            current.addSuccessor(START);
        }
    }

    /** Builds and shows the window with the generate button and the output area. */
    private static void showWindow() {
        var random = new Random();
        var frame = new JFrame("小規模言語モデル");
        var buttonPanel = new JPanel();
        frame.add(buttonPanel, BorderLayout.NORTH);
        var generateButton = new JButton("生成");
        buttonPanel.add(generateButton);
        var output = new JTextArea();
        output.setLineWrap(true);
        frame.add(new JScrollPane(output));
        generateButton.addActionListener(evt -> appendSentence(random, output));
        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
        frame.setSize(800, 600);
        frame.setVisible(true);
    }

    /**
     * Appends one generated sentence followed by a blank line to {@code output}.
     * Starting at {@link #START}, each step picks a successor with probability
     * proportional to its observed count, stopping when {@link #START} is reached
     * again or after {@link #MAX_TOKENS} tokens.
     */
    private static void appendSentence(Random random, JTextArea output) {
        Morph current = START;
        for (int i = 0; i < MAX_TOKENS; ++i) {
            if (current.successorCount == 0) {
                // Nothing was learned; Random.nextInt rejects a bound of zero.
                break;
            }
            int n = random.nextInt(current.successorCount);
            for (var e : current.successor.entrySet()) {
                n -= e.getValue();
                if (n < 0) {
                    current = e.getKey();
                    break;
                }
            }
            if (current == START) {
                break;
            }
            output.append(current.surface);
        }
        output.append("\n\n");
    }
}
@kishida
Copy link
Author

kishida commented Apr 8, 2023

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment