package HiveUDF;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.hive.ql.exec.UDF;

public final class FindPattern extends UDF
{
    // Hive resolves evaluate() through reflection; keep it a public method.
    public String evaluate(String targetString, String strPattern)
    {
        if (targetString == null || strPattern == null) { return null; }
        // Pattern matching on the input pattern; return the first match, or null if none.
        Pattern p = Pattern.compile(strPattern);
        Matcher m = p.matcher(targetString);
        return m.find() ? m.group() : null;
    }
}
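To call the UDF from Hive, the compiled jar has to be added to the session and the class registered as a temporary function. A minimal sketch using the classic Azure HDInsight cmdlets; the jar path and the function name findPattern are illustrative assumptions, not part of the original snippets.

# Sketch: register the UDF on the cluster; the wasb jar path below is a placeholder.
Use-AzureHDInsightCluster -Name "MyHDICluster"
Invoke-Hive -Query @"
ADD JAR wasb:///jars/FindPattern.jar;
CREATE TEMPORARY FUNCTION findPattern AS 'HiveUDF.FindPattern';
"@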
CREATE EXTERNAL TABLE IF NOT EXISTS WebFeeds
(
UserID INT,
Message STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION 'wasb://install@myprimarystorage.blob.core.windows.net/webfeeds';
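With the table in place, the UDF can be applied to its Message column. A hedged example; the regular expression here is illustrative only.

# Sketch: extract the first match of a pattern from each message; the regex is an assumption.
Invoke-Hive -Query "SELECT UserID, findPattern(Message, '[0-9]{3}-[0-9]{4}') FROM WebFeeds LIMIT 10;"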
param
(
    #############################################################################
    ### Please edit the values of the parameters below for your configuration ###
    #############################################################################
    [string]$PrimarySubscriptionName="<Your Subscription Name>", # Replace <Your Subscription Name> with your Azure subscription name
    [string]$HDInsightClusterLocation="West US", # The data center where the HDInsight cluster will be provisioned
    [string]$HDInsightClusterName="MyHDICluster" # The name you want for the HDInsight cluster
)
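These parameters typically feed a provisioning call. A minimal sketch using the classic service-management cmdlets; the storage account and container echo the WASB location used earlier, while the key and node count are placeholder assumptions, and all storage resources must already exist in the cluster's data center.

# Sketch only: provision the cluster using the parameters above.
Select-AzureSubscription -SubscriptionName $PrimarySubscriptionName
$clusterCred = Get-Credential -Message "Enter the admin credentials for the new cluster"
New-AzureHDInsightCluster -Name $HDInsightClusterName `
    -Location $HDInsightClusterLocation `
    -DefaultStorageAccountName "myprimarystorage.blob.core.windows.net" `
    -DefaultStorageAccountKey "<storage account key>" `
    -DefaultStorageContainerName "install" `
    -ClusterSizeInNodes 4 `
    -Credential $clusterCred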
#############################################################################
### Please edit the values of the parameters below for your configuration ###
#############################################################################
$subscriptionName = "<Your Azure Subscription Name>"
$clusterName = "MyHDICluster" # HDInsight Cluster Name
############################################################################
### End Edits ###
############################################################################
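After the edits, the script binds the session to the subscription and points the Hive cmdlets at the cluster. A short sketch of that step:

# Sketch: select the subscription and target the cluster named above.
Select-AzureSubscription -SubscriptionName $subscriptionName
Use-AzureHDInsightCluster -Name $clusterName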
<coordinator-app name="MY_APP" frequency="${coord:months(1)}" start="${jobStart}" end="${jobEnd}" timezone="UTC" xmlns="uri:oozie:coordinator:0.3">
    <datasets>
        <dataset name="input1" frequency="${coord:months(1)}" initial-instance="${initialInstance}" timezone="UTC">
            <uri-template>hcat://headnode0:9083/default/samplelog/dt=${YEAR}-${MONTH}</uri-template>
        </dataset>
    </datasets>
    <input-events>
        <data-in name="coordInput1" dataset="input1">
            <instance>${coord:current(1)}</instance>
        </data-in>
    </input-events>
    <action>
        <workflow>
            <!-- ${workflowAppUri} is a placeholder; point it at the workflow application below -->
            <app-path>${workflowAppUri}</app-path>
        </workflow>
    </action>
</coordinator-app>
<workflow-app xmlns="uri:oozie:workflow:0.2" name="hive-wf">
    <start to="hive-node"/>
    <action name="hive-node">
        <hive xmlns="uri:oozie:hive-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <configuration>
                <property>
                    <name>mapred.job.queue.name</name>
                    <value>default</value>
                </property>
            </configuration>
            <!-- script.q is a placeholder for the Hive script shipped with the workflow -->
            <script>script.q</script>
        </hive>
        <ok to="end"/>
        <error to="fail"/>
    </action>
    <kill name="fail"><message>Hive action failed: ${wf:errorMessage(wf:lastErrorNode())}</message></kill>
    <end name="end"/>
</workflow-app>
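The coordinator and workflow reference variables that Oozie resolves from a job properties file. A sketch of such a file; the property names come from the XML above, but every value shown is a placeholder assumption.

# Oozie job properties (all values are placeholders)
nameNode=wasb://install@myprimarystorage.blob.core.windows.net
jobTracker=jobtrackerhost:9010
jobStart=2014-01-01T00:00Z
jobEnd=2014-12-31T00:00Z
initialInstance=2014-01-01T00:00Z
workflowAppUri=<path to the workflow application>
oozie.coord.application.path=<path to the coordinator application>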
ALTER TABLE samplelog DROP IF EXISTS PARTITION (dt<'${DROPDT}');
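The same cleanup statement can be issued directly from PowerShell by interpolating the cutoff date into the query. A sketch; the date value is an illustrative assumption.

# Sketch: drop partitions older than a cutoff; the date below is illustrative.
$dropDate = "2014-01"
Invoke-Hive -Query "ALTER TABLE samplelog DROP IF EXISTS PARTITION (dt < '$dropDate');"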
#region edits
# This is the name of the Azure HDInsight Subscription
[string]$SubscriptionName = "Contoso2014"
# This is the HDInsight Cluster Name that you want to work with
[string]$HDInsightClusterName = "ServerLogs"
# The table specified here will be dropped; make sure it is a test table
[string]$TableName = "samplelog"
# This is the location where the external table's data will be located on WASB
#region edits
# Your HDInsight Cluster Name
$HDIClusterName = "ServerLogs"
# Your HDInsight Cluster Admin User Name
$MyHDInsightUserName = "sysadmin"
# Your HDInsight Cluster Admin Password
$MyHdInsightPwd = "xxxx"
# Your HDInsight Metastore User Name
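The admin values above are typically folded into a PSCredential for use with the cluster cmdlets or the cluster's REST endpoints. A minimal sketch:

# Sketch: build a credential object from the admin values above.
$securePwd = ConvertTo-SecureString $MyHdInsightPwd -AsPlainText -Force
$clusterCred = New-Object System.Management.Automation.PSCredential($MyHDInsightUserName, $securePwd)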
##################### Begin Edits ####################
#region edits
param
(
# NOTE: All the storage accounts and containers must be created in the same data center as the HDInsight cluster, before this script is run
# They can be created from the Azure Management Portal
# This is the name of your Azure Subscription that will be used for provisioning Azure HDInsight
[string]$PrimarySubscriptionName="xxx",
# This is the primary storage account that needs to be created on the same data center as your HDInsight Cluster