@hivefans
hivefans / datetime_timestamp.py
Last active March 17, 2020 02:03
|-|{"files":{"datetime_timestamp.py":{"env":"plain"}},"tag":"bigdata"}
#coding:UTF-8
import time
dt = "2016-05-05 20:28:54"
# convert to a time struct (time.struct_time)
timeArray = time.strptime(dt, "%Y-%m-%d %H:%M:%S")
# convert to a Unix timestamp
timestamp = time.mktime(timeArray)
# convert to a new time format (20160505-20:28:54)
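The preview cuts off before that last step; a minimal sketch of how the remaining conversion (and the reverse direction, timestamp back to a string) could look with the standard time module, building on the names defined above:

# format the parsed struct as 20160505-20:28:54
newTime = time.strftime("%Y%m%d-%H:%M:%S", timeArray)
# reverse direction: Unix timestamp back to the original string format
original = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))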
@hivefans
hivefans / demo.py
Last active March 17, 2020 02:03 — forked from martinburch/demo.py
Python MySQL upsert
#!/usr/bin/env python
# encoding: utf-8
import MySQLdb
from upsert import upsert
db = MySQLdb.connect(host="localhost", user="root", passwd="", db="demo", charset="utf8")
c = db.cursor()
import warnings
warnings.filterwarnings("ignore", "Unknown table.*")
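The upsert helper imported above wraps MySQL's INSERT ... ON DUPLICATE KEY UPDATE; a minimal sketch of the same pattern done directly on the cursor defined above (the people table and its columns are illustrative assumptions, not part of the original gist):

# insert new ids, update name/age for ids that already exist
rows = [(1, "alice", 30), (2, "bob", 25)]
sql = (
    "INSERT INTO people (id, name, age) VALUES (%s, %s, %s) "
    "ON DUPLICATE KEY UPDATE name = VALUES(name), age = VALUES(age)"
)
c.executemany(sql, rows)
db.commit()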
@hivefans
hivefans / Spark Dataframe Cheat Sheet.py
Last active October 22, 2020 10:27 — forked from crawles/Spark Dataframe Cheat Sheet.py
Cheat sheet for Spark Dataframes (using Python)
# A simple cheat sheet of Spark Dataframe syntax
# Current for Spark 1.6.1
# import statements
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
#creating dataframes
df = sqlContext.createDataFrame([(1, 4), (2, 5), (3, 6)], ["A", "B"]) # from manual data
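A few common operations on that DataFrame, as a sketch in the same Spark 1.6-era style (the column names A and B come from the example above):

df.show()                                       # print the rows
df.printSchema()                                # inspect column types
df.select("A").show()                           # project a single column
df.filter(df["B"] > 4).show()                   # filter rows
df.withColumn("C", df["A"] + df["B"]).show()    # add a derived column
df.groupBy("A").agg({"B": "max"}).show()        # aggregate per key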
@hivefans
hivefans / upsert_table.sql
Last active March 17, 2020 02:03 — forked from bembengarifin/upsert_table.sql
mysql bulk insert, with duplicate key update (upsert), and with conditional data update
/*
references:
- https://dev.mysql.com/doc/refman/5.7/en/insert-on-duplicate.html
- https://stackoverflow.com/questions/32777081/bulk-insert-and-update-in-mysql
- https://thewebfellas.com/blog/conditional-duplicate-key-updates-with-mysql
*/
/* create a new database and use it */
drop database if exists test_upsert;
create database test_upsert;
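The gist goes on to build a bulk upsert whose update is conditional; a minimal sketch of that pattern driven from Python with MySQLdb, as in the earlier gist (the table upsert_table and its columns id, val, updated_at are illustrative assumptions):

import MySQLdb

db = MySQLdb.connect(host="localhost", user="root", passwd="", db="test_upsert", charset="utf8")
c = db.cursor()
# bulk upsert: only overwrite a row when the incoming data is newer
sql = (
    "INSERT INTO upsert_table (id, val, updated_at) VALUES (%s, %s, %s) "
    "ON DUPLICATE KEY UPDATE "
    "val = IF(VALUES(updated_at) > updated_at, VALUES(val), val), "
    "updated_at = IF(VALUES(updated_at) > updated_at, VALUES(updated_at), updated_at)"
)
c.executemany(sql, [(1, "a", "2016-05-05 20:28:54"), (2, "b", "2016-05-05 20:28:54")])
db.commit()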
@hivefans
hivefans / hbase.rest.scanner.filters.md
Last active March 17, 2020 02:03 — forked from stelcheck/hbase.rest.scanner.filters.md
HBase Stargate REST API Scanner Filter Examples

Stargate Scanner Filter Examples

Introduction

So yeah... there is no documentation for the HBase REST API regarding what a filter should look like...

So I installed Eclipse, got the library, and took some time to find some of the (seemingly) most useful filters you could use. I'm very green at anything regarding HBase, and I hope this will help anyone trying to get started with it.

What I discovered is that, basically, the attributes of the filter object follow the same naming as in the documentation. For this reason, I have made each filter name a clickable link to the HBase class documentation it corresponds to; check the constructor argument names, and you will have your attribute list (more or less).
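As a concrete sketch of what one of these filters looks like on the wire, here is a scanner created from Python. The host localhost:8080, the table name mytable, and the row prefix are illustrative assumptions, as is the base64 encoding of the PrefixFilter value, which matches the usual Stargate setup rather than anything stated in the original notes:

import base64
import requests

# create a scanner on mytable whose filter only passes rows starting with "row-1"
scanner_xml = """<Scanner batch="100">
  <filter>
    {"type": "PrefixFilter", "value": "%s"}
  </filter>
</Scanner>""" % base64.b64encode(b"row-1").decode("ascii")

resp = requests.put(
    "http://localhost:8080/mytable/scanner",
    data=scanner_xml,
    headers={"Content-Type": "text/xml"},
)
scanner_url = resp.headers["Location"]  # Stargate answers with the scanner's URL

# fetch a batch of matching rows as JSON (cell keys and values come back base64-encoded)
rows = requests.get(scanner_url, headers={"Accept": "application/json"}).json()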

@hivefans
hivefans / hbase-rest-examples.sh
Last active March 17, 2020 02:03 — forked from karmi/hbase-rest-examples.sh
Experiments with the HBase REST API
#!/usr/bin/env bash
#
# ===================================
# Experiments with the HBase REST API
# ===================================
#
# <http://hbase.apache.org/docs/r0.20.4/api/org/apache/hadoop/hbase/rest/package-summary.html>
#
# Usage:
#
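The kind of call the shell script experiments with can also be reproduced from Python; a minimal sketch assuming a REST gateway on localhost:8080 and an existing table and row key (the names mytable and row-1 are illustrative):

import base64
import requests

# read one row as JSON; column names and values come back base64-encoded
resp = requests.get(
    "http://localhost:8080/mytable/row-1",
    headers={"Accept": "application/json"},
)
for cell in resp.json()["Row"][0]["Cell"]:
    column = base64.b64decode(cell["column"]).decode()
    value = base64.b64decode(cell["$"]).decode()
    print(column, value)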
@hivefans
hivefans / spark_gpkey_comkey
Last active March 17, 2020 02:03
spark.groupByKey, combineByKey
It's best not to use groupByKey on a pairRdd: the groupBy-style functions shuffle data across the cluster and cause performance problems, so a pairRdd usually uses combineByKey instead.
Example:
RDD type before the call: JavaPairRDD<String, HotsCompare>
pairRdd2 = pairRdd.combineByKey(e -> {
    // createCombiner: the first value seen for a key starts a new list
    ArrayList<HotsCompare> list = new ArrayList<HotsCompare>();
    list.add(e);
    return list;
}, (list, e) -> {
    // mergeValue: add a value to the partition-local list
    list.add(e);
    return list;
}, (list1, list2) -> {
    // mergeCombiners: merge lists from different partitions
    list1.addAll(list2);
    return list1;
});
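The same pattern in PySpark, as a minimal sketch (the SparkContext sc and the sample data are illustrative; in practice reduceByKey or aggregateByKey is often preferred when the combiner is cheaper than a list):

pairs = sc.parallelize([("a", 1), ("a", 2), ("b", 3)])
combined = pairs.combineByKey(
    lambda v: [v],                    # createCombiner: first value for a key
    lambda acc, v: acc + [v],         # mergeValue: add a value within a partition
    lambda acc1, acc2: acc1 + acc2,   # mergeCombiners: merge partition results
)
print(combined.collect())             # e.g. [('a', [1, 2]), ('b', [3])]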
@hivefans
hivefans / pyrdd_access_javardd.md
Last active March 17, 2020 02:03 — forked from yu-iskw/testing.md
PySpark serializer and deserializer testing with a nested and complicated value

Python =(parallelize)=> RDD =(collect)=> Python

It works well.

>>> sc = SparkContext('local', 'test', batchSize=2)
>>> data = [([1, 0], [0.5, 0.499]), ([0, 1], [0.5, 0.499])]
>>> rdd = sc.parallelize(data)
>>> rdd.collect()
[([1, 0], [0.5, 0.499]), ([0, 1], [0.5, 0.499])]
@hivefans
hivefans / watch_log.py
Last active March 17, 2020 02:03 — forked from albsen/watch_log.py
Python log file watcher
#!/usr/bin/env python
"""
Real time log files watcher supporting log rotation.
Author: Giampaolo Rodola' <g.rodola [AT] gmail [DOT] com>
License: MIT
"""
import os
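The full gist builds a callback-driven watcher class; a minimal sketch of the core idea (tail a file and detect rotation by comparing inodes) under the assumption of a single log file on a Unix-like system:

import os
import time

def follow(path, interval=1.0):
    """Yield new lines appended to path, reopening it when the file is rotated."""
    f = open(path)
    f.seek(0, os.SEEK_END)
    while True:
        line = f.readline()
        if line:
            yield line
            continue
        try:
            # a different inode at the same path means the log was rotated
            rotated = os.stat(path).st_ino != os.fstat(f.fileno()).st_ino
        except OSError:
            rotated = False  # file temporarily missing mid-rotation
        if rotated:
            f.close()
            f = open(path)   # reopen the freshly created file
        time.sleep(interval)

# usage: for line in follow("/var/log/syslog"): process(line)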
@hivefans
hivefans / NginxLineParser.scala
Last active March 17, 2020 02:03
|-|{"files":{"NginxLineParser.scala":{"env":"plain"},"build.sbt":{"env":"plain"},"NginxLogRecord.scala":{"env":"plain"},"nginx.log":{"env":"plain"},"WordCount.scala":{"env":"plain"}},"tag":"Uncategorized"}
package spark.example
/**
* Created by shidongjie on 2016/12/4.
*/
class NginxLineParser extends Serializable {
private val regex = "([^-]*)\\s+-\\s+(\\S+)\\s+\\[(\\d{2}\\/[a-zA-Z]{3}\\/\\d{4}:\\d{2}:\\d{2}:\\d{2}\\s+-\\d{4})\\]\\s+\"(.+)\"\\s+(\\d{1,}\\.\\d{3})\\s+(\\d+)\\s+\"([^\"]+)\"\\s+Agent\\[\"([^\"]+)\"\\]\\s+(-|\\d.\\d{3,})\\s+(\\S+)\\s+(\\d{1,}).*".r
/**
* @param record Assumed to be an Nginx access log.