Skip to content

Instantly share code, notes, and snippets.

@k8si
k8si / rust_vs_python_tokenizers.py
Last active Sep 30, 2020
differences in rust vs. python tokenizer behavior
View rust_vs_python_tokenizers.py
import logging
import traceback
from copy import deepcopy
from pathlib import Path
from transformers import PreTrainedTokenizer
from transformers.data.processors.squad import SquadV2Processor, SquadExample
from transformers.tokenization_bert import BertTokenizer
from transformers.tokenization_bert import BertTokenizerFast
from transformers.tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
View gist:8595b4460a3e11802dba
class TestCategoricalDomain extends JUnitSuite with cc.factorie.util.FastLogging {
@Test
def testPlusEquals(): Unit = {
val domain = new CategoricalDomain[String](List("yes", "no"))
domain.freeze()
printDomainInfo(domain, "init")
/*
init:
View gist:159be8027acf99c74b46
import java.io._
import cc.factorie.app.nlp.Document
object TestStuff {
def serializeStuff(): Unit = {
class Thing(s: String) extends Serializable {
override def toString: String = s"Thing($s)"
}
View gist:b75e8572c7fe33146a28
import java.io._
object TestStuff {
class Person(val name: String) extends Serializable {
var age: Int = 0
object personProperties extends Serializable {
val location: String = "MA"
var job: String = ""
}
View gist:ae0409929544f032d498
def createParagraph2(paragraphNode: Node, paragraphStart: Int, doc: Document): Unit = {
for (child <- paragraphNode.childNodes) {
if (child.isInstanceOf[TextNode]) {
val tmpDoc = new Document(child.asInstanceOf[TextNode].text)
cc.factorie.app.nlp.segment.DeterministicNormalizingTokenizer.process(tmpDoc)
//attach the tokens to the original document
tmpDoc.tokens.foreach { token => new Token(doc, token.string) }
} else if (child.nodeName.equals("a")) {
val linkTarget: String = child.attr("href")
val linkText: String = child.childNode(0).toString()
@k8si
k8si / process.sh
Created May 20, 2015
Grobid script
View process.sh
#!/bin/bash
memory="1024m"
jarfile="/home/kate/research/myproject/grobid/grobid-core/target/grobid-core-0.3.4-SNAPSHOT.one-jar.jar"
grobidHome="/home/kate/research/myproject/grobid/grobid-home"
config="/home/kate/research/myproject/grobid/grobid-home/config/grobid.properties"
input="/home/kate/research/myproject/pdfs"
output="/home/kate/research/myproject/output"
java -Xmx$memory -jar $jarfile \
@k8si
k8si / pyc_links.md
Last active May 9, 2021
A list of links explaining the .pyc file format and other stuff
View pyc_links.md