Skip to content

Instantly share code, notes, and snippets.

View luistung's full-sized avatar
🎯
Focusing

luistung

🎯
Focusing
View GitHub Profile
@luistung
luistung / split_by_field.py
Last active January 1, 2016 07:19
按第一字段切分文件
#!/bin/env python
#usage argv[0] file_tobe_split target_dir
#按第一字段切分文件
import sys
src_file = sys.argv[1]
tgt_dir = sys.argv[2]
@luistung
luistung / swap
Created April 28, 2014 13:22
交换两个文件
#!/bin/sh
# Validate the argument count: the script accepts file1 file2 [tmpdir],
# i.e. exactly two or three arguments.
# POSIX `[ ]` is used because the shebang is /bin/sh; the bash-only `[[ ]]`
# is not available in strict POSIX shells (e.g. dash), where the failed
# lookup would make the condition false and silently skip this check.
if [ "$#" -ne 2 ] && [ "$#" -ne 3 ]
then
    echo '需要至少两个参数' >&2
    # Every diagnostic line goes to stderr so stdout stays clean.
    echo 'usage:' >&2
    # Quote "$0" so a script path containing spaces is handled correctly.
    echo "$(basename "$0") file1 file2 tmpdir" >&2
    exit 1
fi
@luistung
luistung / counter
Created May 22, 2014 07:18
python全局计数器
def counter(interval):
    """Count calls and report progress on stderr every ``interval`` calls.

    The running total is stored on the function object itself
    (``counter.count``), so it is shared by every caller in the process.

    Args:
        interval: Reporting period. The current total is written to stderr
            each time it is a multiple of this value.

    Raises:
        ZeroDivisionError: If ``interval`` is 0.
    """
    import sys

    # Lazily create the function attribute on the first call.
    if not hasattr(counter, 'count'):
        counter.count = 0
    counter.count += 1
    if counter.count % interval == 0:
        # write() replaces the Python 2-only ``print >>stream`` statement so
        # the same code runs unchanged on Python 2 and Python 3.
        sys.stderr.write('%s\n' % counter.count)
@luistung
luistung / sort.py
Last active August 29, 2015 14:01
文件排序
import sys
lines = []
for line in sys.stdin:
line = line.rstrip()
tokens = line.split('\t')
src, tgt = tokens[0], tokens[1]
lines.append(((src,tgt), line)) #TODO
lines.sort(key=lambda a:a[0])
@luistung
luistung / merge.py
Created May 27, 2014 04:57
多路归并
import os
import sys
import heapq
fileList = [open(sys.argv[1] + '/' + f) for f in os.listdir(sys.argv[1])]
def lineIterator(f):
for line in f:
line = line.rstrip()
tokens = line.split('\t')
@luistung
luistung / SConstruct
Last active August 29, 2015 14:03
scons示例
# Construction environment shared by both targets below. The flags select
# the GNU C++0x dialect, enable all warnings and disable optimisation.
# NOTE(review): CPPFLAGS is SCons' preprocessor-flags variable; compiler
# options are conventionally placed in CXXFLAGS/CCFLAGS - confirm intent.
env = Environment(CPPFLAGS=["-std=gnu++0x", "-Wall", "-O0"])
# Build the shared library "foo" from f1.cpp.
env.SharedLibrary('foo', ['f1.cpp'])
# Build a program from test.cpp, linked against boost_regex, boost_locale
# and the "foo" library; LIBPATH='.' adds the current directory to the
# linker's library search path.
env.Program("test.cpp", LIBS=["boost_regex", "boost_locale", "foo"], LIBPATH='.')
@luistung
luistung / httpserver.py
Created July 29, 2014 08:54
python简单http服务器
from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
class TestHTTPHandler(BaseHTTPRequestHandler):
    """Minimal request handler that answers every GET with 'hello world'."""

    def do_GET(self):
        """Send a 200 response carrying a fixed plain body."""
        body = 'hello world'
        # The attribute was previously misspelled "protocal_version", which
        # only created an unused attribute and left the status line at the
        # handler's default protocol version.
        self.protocol_version = 'HTTP/1.1'
        self.send_response(200)
        self.send_header("Welcome", "Contect")
        # An HTTP/1.1 response must be framed: announce the body length and
        # tell the client that the connection is closed after this reply.
        self.send_header("Content-Length", str(len(body)))
        self.send_header("Connection", "close")
        self.end_headers()
        self.wfile.write(body)
http_server = HTTPServer(('0.0.0.0', 7778), TestHTTPHandler)
@luistung
luistung / .vimrc
Last active April 24, 2019 03:31
vim config
syntax on
colorscheme desert
set number
set cindent
set tabstop=4
set shiftwidth=4
set expandtab
set termencoding=utf-8
set fileencodings=utf-8
set encoding=utf-8
@luistung
luistung / tokenization.cpp
Created October 11, 2019 12:02
c++ version of bert tokenize
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <unordered_map>
#include <boost/algorithm/string.hpp>
#include <utf8proc.h>
//https://unicode.org/reports/tr15/#Norm_Forms
//https://ssl.icu-project.org/apiref/icu4c/uchar_8h.html
@luistung
luistung / tokenization.cpp
Last active May 30, 2024 03:15
c++ version of bert tokenize
/* c++ version of tokenization for bert
Copyright (C) 2019 luistung
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of