Duy Do duydo

## ByteTokenizer.java
/**
 * @(#)ByteTokenizer.java Sep 23, 2008
 * Copyright (C) 2008 Duy Do. All Rights Reserved.
 */
package com.duydo.util;

import java.util.Enumeration;
import java.util.NoSuchElementException;

/**

## bot.rb
# Use this script to test that your Telegram bot works.
#
# Install the dependency
#
#   $ gem install telegram_bot
#
# Run the bot
#
#   $ ruby bot.rb
#

## elasticsearch_best_practices.txt
If you want, I can try and help with pointers as to how to improve the indexing speed you get. It's quite easy to really increase it by using some simple guidelines, for example:

- Use create in the index API (assuming you can).
- Relax the real time aspect from 1 second to something a bit higher (index.engine.robin.refresh_interval).
- Increase the indexing buffer size (indices.memory.index_buffer_size), it defaults to the value 10% which is 10% of the heap.
- Increase the number of dirty operations that trigger automatic flush (so the translog won't get really big, even though it's FS based) by setting index.translog.flush_threshold (defaults to 5000).
- Increase the memory allocated to the elasticsearch node. By default it's 1g.
- Start with a lower replica count (even 0), and then once the bulk loading is done, increase it to the value you want it to be using the update_settings API. This will improve things as possibly fewer shards will be allocated to each machine.
- Increase the number of machines you have so

## proc_net_tcp_decode
Decoding the data in /proc/net/tcp:

Linux 5.x  /proc/net/tcp
Linux 6.x  /proc/PID/net/tcp

Given a socket:

$ ls -l  /proc/24784/fd/11
lrwx------ 1 jkstill dba 64 Dec  4 16:22 /proc/24784/fd/11 -> socket:[15907701]

## elasticsearch.yml
##################################################################
# /etc/elasticsearch/elasticsearch.yml
#
# Base configuration for a write heavy cluster
#

# Cluster / Node Basics
cluster.name: logng

# Node can have arbitrary attributes we can use for routing

## gist:9270e6e9ac326184dab5b9b11ecde2e3
import operator
from pprint import pprint

def is_dict(d):
    """Return True if ``d`` is an instance of ``dict`` (including subclasses), else False."""
    return isinstance(d, dict)

def get(c, k, default=None):
    try:
        return c[k]
    except (IndexError, KeyError, TypeError):

## gist:9587121
    {
        "analysis": {
            "filter": {
                "ar_stop_filter": {
                    "type": "stop",
                    "stopwords": ["_arabic_"]
                },
                "bg_stop_filter": {
                    "type": "stop",
                    "stopwords": ["_bulgarian_"]

## gist:ac4358ec3bddcaba02cf347369923674
PUT test_index
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "analysis": {
      "analyzer": {
        "test_analyzer": {
          "type":"custom",
          "tokenizer": "whitespace",

## twitter_mapping.sh
curl -XPUT 'http://localhost:9200/twitter' -d '{
    "settings" : {
        "index" : {
            "number_of_shards" : 1,
            "number_of_replicas" : 1
        },
        "analysis" : {
            "filter" : {
                "tweet_filter" : {
                    "type" : "word_delimiter",

## .gitignore
nginx/
!nginx/.gitkeep
!nginx/logs/.gitkeep

src/

tmp/
	/**
	* @(#)ByteTokenizer.java Sep 23, 2008
	* Copyright (C) 2008 Duy Do. All Rights Reserved.
	*/
	package com.duydo.util;

	import java.util.Enumeration;
	import java.util.NoSuchElementException;

	/**
	# Use this script to test that your Telegram bot works.
	#
	# Install the dependency
	#
	# $ gem install telegram_bot
	#
	# Run the bot
	#
	# $ ruby bot.rb
	#
	If you want, I can try and help with pointers as to how to improve the indexing speed you get. It's quite easy to really increase it by using some simple guidelines, for example:

	- Use create in the index API (assuming you can).
	- Relax the real time aspect from 1 second to something a bit higher (index.engine.robin.refresh_interval).
	- Increase the indexing buffer size (indices.memory.index_buffer_size), it defaults to the value 10% which is 10% of the heap.
	- Increase the number of dirty operations that trigger automatic flush (so the translog won't get really big, even though it's FS based) by setting index.translog.flush_threshold (defaults to 5000).
	- Increase the memory allocated to the elasticsearch node. By default it's 1g.
	- Start with a lower replica count (even 0), and then once the bulk loading is done, increase it to the value you want it to be using the update_settings API. This will improve things as possibly fewer shards will be allocated to each machine.
	- Increase the number of machines you have so
	Decoding the data in /proc/net/tcp:

	Linux 5.x /proc/net/tcp
	Linux 6.x /proc/PID/net/tcp

	Given a socket:

	$ ls -l /proc/24784/fd/11
	lrwx------ 1 jkstill dba 64 Dec 4 16:22 /proc/24784/fd/11 -> socket:[15907701]
	##################################################################
	# /etc/elasticsearch/elasticsearch.yml
	#
	# Base configuration for a write heavy cluster
	#

	# Cluster / Node Basics
	cluster.name: logng

	# Node can have arbitrary attributes we can use for routing
	import operator
	from pprint import pprint

	def is_dict(d):
	return isinstance(d, dict)

	def get(c, k, default=None):
	try:
	return c[k]
	except (IndexError, KeyError, TypeError):
	{
	"analysis": {
	"filter": {
	"ar_stop_filter": {
	"type": "stop",
	"stopwords": ["_arabic_"]
	},
	"bg_stop_filter": {
	"type": "stop",
	"stopwords": ["_bulgarian_"]
	PUT test_index
	{
	"settings": {
	"number_of_shards": 1,
	"number_of_replicas": 0,
	"analysis": {
	"analyzer": {
	"test_analyzer": {
	"type":"custom",
	"tokenizer": "whitespace",
	curl -XPUT 'http://localhost:9200/twitter' -d '{
	"settings" : {
	"index" : {
	"number_of_shards" : 1,
	"number_of_replicas" : 1
	},
	"analysis" : {
	"filter" : {
	"tweet_filter" : {
	"type" : "word_delimiter",