Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
lens demo commands
showterm
cd ~/Git/incubator-lens
export PS1="\u@\h:\w [$?]\n\$"
# Now directory is on one line, command on next
# compile: Not running right now. http://showterm.io/48a63d963385a442e9978
# mvn clean install -DskipTests -Dcheckstyle.skip -Dfindbugs.skip=true -DskipCheck
cd lens-dist/target/*bin/*bin/
echo $HADOOP_HOME
echo $HIVE_HOME
ls
cd server
# kill any existing zombie server
ps -ef | grep LensServer | grep -v grep | awk '{print $2}' | xargs -L1 kill -9
# restart and wait till restarted
bin/lens-ctl restart && sleep 5 && tail -f logs/lensserver.log | sed '/0.0.0.0:9999/q'
# let's go to client and open cli for demo
cd ../client
# first, let's convert paths to absolute paths in some of our files
; They are files of partition description. We'll use them later.
sed -i.bak s/location=\"e/location=\"file:\\/\\/\\/Users\\/rajat.khandelwal\\/Git\\/incubator-lens\\/lens-dist\\/target\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/client\\/e/g examples/resources/product-local-parts.xml
sed -i.bak s/location=\"e/location=\"file:\\/\\/\\/Users\\/rajat.khandelwal\\/Git\\/incubator-lens\\/lens-dist\\/target\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/client\\/e/g examples/resources/city-local-part.xml
sed -i.bak s/location=\"e/location=\"file:\\/\\/\\/Users\\/rajat.khandelwal\\/Git\\/incubator-lens\\/lens-dist\\/target\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/client\\/e/g examples/resources/customer-local-part.xml
sed -i.bak s/location=\"e/location=\"file:\\/\\/\\/Users\\/rajat.khandelwal\\/Git\\/incubator-lens\\/lens-dist\\/target\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/client\\/e/g examples/resources/sales-raw-local-parts.xml
sed -i.bak s/location=\"e/location=\"file:\\/\\/\\/Users\\/rajat.khandelwal\\/Git\\/incubator-lens\\/lens-dist\\/target\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/client\\/e/g examples/resources/sales-aggr-fact1-local-parts.xml
sed -i.bak s/location=\"e/location=\"file:\\/\\/\\/Users\\/rajat.khandelwal\\/Git\\/incubator-lens\\/lens-dist\\/target\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/client\\/e/g examples/resources/sales-aggr-fact2-local-parts.xml
sed -i.bak s/location=\"e/location=\"file:\\/\\/\\/Users\\/rajat.khandelwal\\/Git\\/incubator-lens\\/lens-dist\\/target\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/client\\/e/g examples/resources/sales-aggr-fact1-mydb-parts.xml
sed -i.bak s/location=\"e/location=\"file:\\/\\/\\/Users\\/rajat.khandelwal\\/Git\\/incubator-lens\\/lens-dist\\/target\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/client\\/e/g examples/resources/sales-aggr-fact2-mydb-parts.xml
bin/lens-cli.sh
; This is a comment
; you can run shell commands here by beginning with !
; e.g.:
! ls
! ls examples
! ls examples/data
! ls examples/resources
; fresh db:
use newdb
; Let's discuss the basics in the process of registering and querying data
; Storage: storage represents a physical storage. It can be Hadoop File system or a data base. It is defined by name, endpoint and properties associated with.
; let's create some storages:
! grep -A 33333333 <x_ examples/resources/local-storage.xml
create storage examples/resources/local-storage.xml
; Second dimension:
! grep -A 33333333 <x_ examples/resources/local-cluster-storage.xml
create storage examples/resources/local-cluster-storage.xml
; Third dimension:
! grep -A 33333333 <x_ examples/resources/db-storage.xml
create storage examples/resources/db-storage.xml
; See the three storages:
show storages
; Every entity generally will have show command in lens cli.
; Lens Entities
; Logical tables: Collection of Columns.
; Column can either be a Dim Attribute, A Measure, or an Expression
; Dimensions and Cubes are logical tables
; Dimensions: A logical table having Only Dim Attributes and Expressions over them
; Cubes: Logical tables having Dim Attributes, Measures and Expressions on them. Mostly expressions will be only on measures.
; Both Cube and Dimension can have a special kind of dim attribute called Reference Dim Attribute. It can reference another Logical table's Dim Attribute.
; E.g. Cube will have country id as a direct dim attribute and country name as ref dim attribute.
; Complex references: Join Chains. Path from source table to destination column
; Sometimes direct relation is not possible
; Or
; Sometimes it's not unique, There are multiple paths to reach to the same table
; It's logically possible to relate two physical tables through two diferent paths
; Then you can specify explicit paths between tables as join chains
; More on this later
; We'll create a sales cube and related dimensions.
; Dimensions first:
; City dimension:
! grep -A 33333333 <x_ examples/resources/city.xml
create dimension examples/resources/city.xml
; customer dimension:
; Entity names are case insensitive.
! grep -A 33333333 <x_ examples/resources/customer.xml
create dimension examples/resources/customer.xml
; Product dimension
! grep -A 33333333 <x_ examples/resources/product.xml
create dimension examples/resources/product.xml
; Listing all the dimensions we created:
show dimensions
; Queryable fields of dimensions
dimension show fields city
dimension show fields customer
; We can view join chains of dimensions:
dimension show joinchains customer
; Flattened view of the table shows direct fields as well as fields queryable via join chains
dimension show fields customer --flattened
; Example cubes:
; We'll create a sales cube.
; Cube xml file is pretty big, it contains dim attributes, measures, and join chains
; And other important properties, e.g. relation between time dimensions. More on this later
! grep -A 33333333 <x_ examples/resources/sales-cube.xml
; as mentioned before, it has dim attributes, measures, expressions and join chains.
; some joinchain.fieldname can be directly made available in cube.
; e.g. production_city_name in above cube
create cube examples/resources/sales-cube.xml
show cubes
; you can see queryable fields of a cube
cube show fields sales
; Similarly, join chains:
cube show joinchains sales
; Here, customer_city, delivery_city, and production_city all begin at sales cube and reach to city dimension
; But paths are different. They use different columns of the cube. Hence join chains need to be defined
; Some fields have been directly made available. e.g. production_city_name.
; And finally, the flattened view
cube show fields sales --flattened
; Physical Tables
; Logical tables are about collection of columns, and relations between them etc.
; Physical tables are about layout of the data
; What all storages it's stored in, with what order of columns
; What are partition columns of the data, etc
; Cubes have facts, Dimensions have Dimtables
; Facts can have subset of fields of a cube
; Facts belong to a cube.
; Similar relation between Dimtables and Dimensions
; Cubes have time dimensions, facts have partition columns corresponding to time dimensions
! grep timed examples/resources/sales-cube.xml
; So the sales cube has three time dimensions
; Each has it's corresponding partition column
; A fact of this cube might or might not be partitioned by these partition columns
; Time timensions are the ones that should be used for querying
; partitions are mentioned when registering data in facts or dimtables
; The translation is done by lens, as specified in the cube xml
; Lens sees which time dimensions are queried, and then preferrably chooses the facts which
; are partitioned by the time dimension's partition column.
; The fallback strategy will be discussed later
; Dim tables are generally assumed to be snapshots
; So they will be available with only one partition column
; Though there is no restriction as of now.
; One dim table is one selection of columns of the dimension.
; The selection means the set of columns and the order
; The selection of columns can be stored on multiple storages
; But a different selection will be a separate dimtable.
; Create dim tables
; City has two different dim tables, one for each storage
! grep -A 33333333 <x_ examples/resources/city_table.xml
create dimtable examples/resources/city_table.xml
! grep -A 33333333 <x_ examples/resources/city_subset.xml
create dimtable examples/resources/city_subset.xml
; similarly product
; Now the following product table is partitioned by one time partition, and one non time partition
; this is an available decision for users of lens.
; data need not be only partitioned by time partition columns
! grep -A 33333333 <x_ examples/resources/product_table.xml
create dimtable examples/resources/product_table.xml
! grep -A 33333333 <x_ examples/resources/product_db_table.xml
create dimtable examples/resources/product_db_table.xml
; customer has only one dimtable which is stored on both storages with same schema
! grep -A 33333333 <x_ examples/resources/customer_table.xml
create dimtable examples/resources/customer_table.xml
show dimtables
; We can see dimtables of a particular dimension too:
show dimtables city
; create facts
; raw fact available only on local storage with hourly granularity, partitioned by all three part cols.
! grep -A 33333333 <x_ examples/resources/sales-raw-fact.xml
create fact examples/resources/sales-raw-fact.xml
; aggr fact 1 available on both local and mydb with differnet granularities and both having three partitions
! grep -A 33333333 <x_ examples/resources/sales-aggr-fact1.xml
create fact examples/resources/sales-aggr-fact1.xml
; aggr fact 2 on local has only one partition, on mydb it has three partitions. granularities are different
! grep -A 33333333 <x_ examples/resources/sales-aggr-fact2.xml
create fact examples/resources/sales-aggr-fact2.xml
show facts
; facts of one cube:
show facts sales
; All facts belonged to same cube
; Adding data
; Data is to be added as partitions in dimtables and facts
; there are two commands for add partition:
; you can add single partition or multiple partitions
; this will be clear in following commands
; dimtable partitions:
; product local dimtable has two partitions
; So each partition to be registered has values for both part cols:
! grep -A 33333333 <x_ examples/resources/product-local-parts.xml
dimtable add partitions --dimtable_name product_table --storage_name local --path examples/resources/product-local-parts.xml
; city local has only one part col. And we're only registering a single partition. not a partition list
! grep -A 33333333 <x_ examples/resources/city-local-part.xml
dimtable add single-partition --dimtable_name city_table --storage_name local --path examples/resources/city-local-part.xml
; similar deal for customer table.
! grep -A 33333333 <x_ examples/resources/customer-local-part.xml
dimtable add single-partition --dimtable_name customer_table --storage_name local --path examples/resources/customer-local-part.xml
; If you noticed, we only registered partitions for local storage
; We're only demonstrating one storage here.
; fact partitions:
; two partitions for raw, has 3 time part cols:
! grep -A 33333333 <x_ examples/resources/sales-raw-local-parts.xml
fact add partitions --fact_name sales_raw_fact --storage_name local --path examples/resources/sales-raw-local-parts.xml
; three parts for aggr fact1 on local storage
! grep -A 33333333 <x_ examples/resources/sales-aggr-fact1-local-parts.xml
fact add partitions --fact_name sales_aggr_fact1 --storage_name local --path examples/resources/sales-aggr-fact1-local-parts.xml
; fact 2 on local has one partition column. Partitions are registered for DAILY and HOURLY update periods:
! grep -A 33333333 <x_ examples/resources/sales-aggr-fact2-local-parts.xml
fact add partitions --fact_name sales_aggr_fact2 --storage_name local --path examples/resources/sales-aggr-fact2-local-parts.xml
; for db storage, registering partitions for aggr fact1 and fact 2 :
; fact1 has three time part cols on mydb. Registreing a singleton list of partitions.
; Can be registered as single_partition too, but the xml will be a little different
; the x_partition_list wrapper tags will have to be removed.
; For now, let's register the singleton list:
! grep -A 33333333 <x_ examples/resources/sales-aggr-fact1-mydb-parts.xml
fact add partitions --fact_name sales_aggr_fact1 --storage_name mydb --path examples/resources/sales-aggr-fact1-mydb-parts.xml
; fact2 also has three time part cols on mydb
! grep -A 33333333 <x_ examples/resources/sales-aggr-fact2-mydb-parts.xml
fact add partitions --fact_name sales_aggr_fact2 --storage_name mydb --path examples/resources/sales-aggr-fact2-mydb-parts.xml
; I've not shown you the actual data. The partitions only mention location of data
; Which is in examples/data
; If you want, you can look there yourself. (Homework :P)
; Now We can run queries on sales cube and the three dimensions.
; While loading data, We had associated it to partitions of tables.
; While querying, we can either use the given partition column
; Or we can use the time dimension of the partition column
; If the time dimension maps to a partition column, it's preferable to proceed with a fact that has that partition column
query execute cube select product_id, store_sales from sales where time_range_in(order_time, '2015-04-11-00', '2015-04-13-00')
query execute cube select product_id, store_sales from sales where time_range_in(order_time, '2015-04-11-00', '2015-04-13-01')
query execute cube select product_id, store_sales from sales where time_range_in(order_time, '2015-04-12-00', '2015-04-13-00')
query execute cube select product_id, store_sales from sales where time_range_in(order_time, '2015-04-13-00', '2015-04-13-02')
query execute cube select product_id, store_sales from sales where time_range_in(delivery_time, '2015-04-11-00', '2015-04-13-00')
query execute cube select product_id, store_sales from sales where time_range_in(delivery_time, '2015-04-12-00', '2015-04-13-00')
query execute cube select promotion_sales, store_sales from sales where time_range_in(order_time, '2015-04-13-00', '2015-04-13-02')
query execute cube select promotion_sales, store_sales from sales where time_range_in(order_time, '2015-04-13-00', '2015-04-13-01')
query execute cube select customer_city_name, store_sales from sales where time_range_in(delivery_time, '2015-04-12-00', '2015-04-13-00')
query execute cube select customer_city_name, store_sales from sales where time_range_in(delivery_time, '2015-04-11-00', '2015-04-13-00')
query execute cube select customer_city_name, delivery_city.name, production_city.name, store_sales from sales where time_range_in(delivery_time, '2015-04-11-00', '2015-04-13-00')
; in product table, category was a non time partition
; We can use that in queries too:
query execute cube select product_details.color, store_sales from sales where time_range_in(order_time, '2015-04-11-00', '2015-04-13-00') and product_details.category='Stationary'
query execute cube select product_details.category, store_sales from sales where time_range_in(order_time, '2015-04-11-00', '2015-04-13-01')
query execute cube select product_details.color, store_sales from sales where time_range_in(order_time, '2015-04-12-00', '2015-04-13-00')and product_details.category='Stationary'
query execute cube select product_details.category, store_sales from sales where time_range_in(order_time, '2015-04-11-00', '2015-04-13-01')
query execute cube select product_details.category, store_sales from sales where time_range_in(order_time, '2015-04-12-00', '2015-04-13-00')
query execute cube select product_details.color, store_sales from sales where time_range_in(delivery_time, '2015-04-11-00', '2015-04-13-00') and product_details.category='Stationary'
query execute cube select product_details.color, store_sales from sales where time_range_in(delivery_time, '2015-04-12-00', '2015-04-13-00') and product_details.category='Stationary'
; The following query sees that ot part col is not present and falls back on querying on dt part col
; If no fact has partition column, something called time dim relation is used
! grep relation examples/resources/sales-cube.xml
; It means that order time can be between delivery time -20 days to delivery time - 1 hour
; It means that if a query comes on order time and no fact has order time partitions for the queried range
; Then the relation will be used and corresponding range for relating time dim will be formed
; And partition column of the relating time dim will be picked for querying
query execute cube select customer_city_name, store_cost from sales where time_range_in(order_time, '2015-04-13-03', '2015-04-13-04')
; exit the cli
bye
# finish demo
exit
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment