Skip to content

Instantly share code, notes, and snippets.

View 2shou's full-sized avatar

Gavin Zhang 2shou

View GitHub Profile
@2shou
2shou / custom_pipeline.py
Created January 8, 2015 07:36
Custom item exporter for Scrapy
from scrapy.contrib.exporter import BaseItemExporter
from scrapy import signals, log
from pipeline_base import StorePipeline
from os.path import join
class CustomItemExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
self._configure(kwargs, dont_fail=True)
@2shou
2shou / spark_log_extract.py
Last active August 8, 2019 13:30
log-extract script with Spark
# coding: utf-8
# command example:
# $ spark-submit spark_log_extract.py \
# --name test \
# --notblankkeys dn,stm,ev_ac,pg_url \
# --filterregex ".*(=ac_pl\`|=ac_dl\`).*" \
# --usegzip \
# /path/to/source \
# /path/to/atom \
@2shou
2shou / SparkLogExtract.scala
Last active October 20, 2015 13:35
log-extract program with Spark
import org.apache.spark.{SparkConf, SparkContext}
object LogExtract {
val keys = Array[String]("dn", "stm", "ev_ac", "v_title", "v_uri", "pg_url")
val notBlankKeys = Array[String]("dn", "stm", "ev_ac", "pg_url")
val filterRegex = ".*(ac_pl`|ac_dl`).*"
val useCompress = false
def process(line: String): String = {
@2shou
2shou / hiveserver2.py
Last active January 17, 2023 06:34
Async client for HiveServer2
# coding: utf-8
from TCLIService import TCLIService
from TCLIService.ttypes import TOpenSessionReq, TGetTablesReq, TFetchResultsReq, \
TStatusCode, TGetResultSetMetadataReq, TGetColumnsReq, TType, TTypeId, \
TExecuteStatementReq, TGetOperationStatusReq, TFetchOrientation, TCloseOperationReq, \
TCloseSessionReq, TGetSchemasReq, TCancelOperationReq, THandleIdentifier, \
TOperationHandle, TOperationState
from thrift import Thrift
@2shou
2shou / lst_classifier.py
Last active December 5, 2017 07:43
LibShortText demo
# coding: utf-8
# __author__ = 'zhangguoze'
import sys
import jieba
from libshorttext.libshorttext.analyzer import *
from libshorttext.libshorttext.classifier import *
from libshorttext.libshorttext.converter import *
@2shou
2shou / sci_classifier.py
Last active March 6, 2019 07:23
scikit-learn Naive Bayes (MultinomialNB) example
# coding: utf-8
import sys
import jieba
import numpy
from sklearn import metrics
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import MultinomialNB