Skip to content

Instantly share code, notes, and snippets.

View yawo's full-sized avatar

Yawo Kpotufe yawo

  • YawoSoft
  • Paris, France
View GitHub Profile
#!/bin/sh
# Split events.tsv by event type into comma-separated extracts, then load each
# extract into an HBase table with the ImportTsv tool.
#
# Requires HBASE_HOME to point at the HBase installation directory.

# Fail with a clear message instead of silently trying to run /bin/hbase.
: "${HBASE_HOME:?HBASE_HOME must be set to the HBase installation directory}"

# sed program shared by both extracts. Counting comma-separated fields from 1,
# it captures fields 3, 5, 9 and 15 and rewrites each matching line to
#     <f3>_<f5>_<f9>_<f15>,<f5>,<f9>,<f15>
# The first output column is the composite value loaded as HBASE_ROW_KEY; the
# remaining three are loaded as u:id, p:id and c:id (see import_table).
# Lines with fewer than 15 fields do not match and pass through unchanged.
EXTRACT_FIELDS='s/\([^,]*,\)\{2\}\([^,]*\),[^,]*,\([^,]*\),\([^,]*,\)\{3\}\([^,]*\),\([^,]*,\)\{5\}\([^,]*\).*/\2_\3_\5_\7,\3,\5,\7/g'

# extract_events EVENT_NAME OUTPUT_FILE
# Append the reformatted events.tsv lines that contain EVENT_NAME to OUTPUT_FILE.
extract_events() {
    grep "$1" events.tsv | sed -e "$EXTRACT_FIELDS" >> "$2"
}

# import_table TABLE INPUT_FILE
# Run ImportTsv on INPUT_FILE into TABLE, using "," as the field separator.
import_table() {
    "$HBASE_HOME/bin/hbase" org.apache.hadoop.hbase.mapreduce.ImportTsv \
        -Dimporttsv.separator=, \
        -Dimporttsv.columns=HBASE_ROW_KEY,u:id,p:id,c:id \
        "$1" "$2"
}

#Split
extract_events AddToCartEvent addtocart.tsv
extract_events ProductDetailPageViewEvent view.tsv
# Disabled alternative for the view extract: emits field 3 alone as the first
# column instead of the composite value.
#grep ProductDetailPageViewEvent events.tsv | sed -e 's/\([^,]*,\)\{2\}\([^,]*\),[^,]*,\([^,]*\),\([^,]*,\)\{3\}\([^,]*\),\([^,]*,\)\{5\}\([^,]*\).*/\2,\3,\5,\7/g' >> view.tsv

#Load in Hbase
import_table addtocart addtocart.tsv
import_table view view.tsv
bestLength = Integer.MAX_VALUE
bestVertex = null
magicPhrase = new Scanner(System.in).nextLine()
start = System.currentTimeMillis()
magicPhraseLength = magicPhrase.length()
voidState = [' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' '
,' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ']
stopTime = 1800
chars = [' ':0,'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7,'H':8,'I':9,'J':10,'K':11,'L':12,
/**
* @author yawo
*
*/
package com.yawozone
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
require 'anemone'
class Fhf
puts 'Starting crawl..'
file = File.open('fhf.html', 'w')
file.write %Q{
<html>
<head>
<meta content='text/html; charset=utf-8' http-equiv='Content-Type'>
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css">
@yawo
yawo / desmgidf.rb
Created March 28, 2015 16:52
Scraper for "médecine générale" (general practice) services in the French IDF (Île-de-France) region.
require 'anemone'
class Desmgidf
puts 'Starting crawl..'
file = File.open('desmgidf.html', 'w')
file.write %Q{
<html>
<head>
<meta content='text/html; charset=utf-8' http-equiv='Content-Type'>
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css">