lens demo commands
showterm
cd ~/Git/incubator-lens
export PS1="\u@\h:\w [$?]\n\$"
# Now directory is on one line, command on next
# compile: Not running right now. http://showterm.io/48a63d963385a442e9978
# mvn clean install -DskipTests -Dcheckstyle.skip -Dfindbugs.skip=true -DskipCheck
cd lens-dist/target/*bin/*bin/
echo $HADOOP_HOME
echo $HIVE_HOME
ls
cd server
# kill any existing zombie server
ps -ef | grep LensServer | grep -v grep | awk '{print $2}' | xargs -L1 kill -9
# restart and wait till restarted
bin/lens-ctl restart && sleep 5 && tail -f logs/lensserver.log | sed '/0.0.0.0:9999/q'
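# a hedged, sleep-free variant (not part of the original demo): poll the log for the same
# bind line instead of sleeping a fixed 5 seconds before tailing
# bin/lens-ctl restart && until grep -q '0.0.0.0:9999' logs/lensserver.log 2>/dev/null; do sleep 1; done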
# let's go to client and open cli for demo
cd ../client
# first, let's convert paths to absolute paths in some of our files
# these are the partition description files; we'll use them later
sed -i.bak s/location=\"e/location=\"file:\\/\\/\\/Users\\/rajat.khandelwal\\/Git\\/incubator-lens\\/lens-dist\\/target\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/client\\/e/g examples/resources/product-local-parts.xml
sed -i.bak s/location=\"e/location=\"file:\\/\\/\\/Users\\/rajat.khandelwal\\/Git\\/incubator-lens\\/lens-dist\\/target\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/client\\/e/g examples/resources/city-local-part.xml
sed -i.bak s/location=\"e/location=\"file:\\/\\/\\/Users\\/rajat.khandelwal\\/Git\\/incubator-lens\\/lens-dist\\/target\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/client\\/e/g examples/resources/customer-local-part.xml
sed -i.bak s/location=\"e/location=\"file:\\/\\/\\/Users\\/rajat.khandelwal\\/Git\\/incubator-lens\\/lens-dist\\/target\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/client\\/e/g examples/resources/sales-raw-local-parts.xml
sed -i.bak s/location=\"e/location=\"file:\\/\\/\\/Users\\/rajat.khandelwal\\/Git\\/incubator-lens\\/lens-dist\\/target\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/client\\/e/g examples/resources/sales-aggr-fact1-local-parts.xml
sed -i.bak s/location=\"e/location=\"file:\\/\\/\\/Users\\/rajat.khandelwal\\/Git\\/incubator-lens\\/lens-dist\\/target\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/client\\/e/g examples/resources/sales-aggr-fact2-local-parts.xml
sed -i.bak s/location=\"e/location=\"file:\\/\\/\\/Users\\/rajat.khandelwal\\/Git\\/incubator-lens\\/lens-dist\\/target\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/client\\/e/g examples/resources/sales-aggr-fact1-mydb-parts.xml
sed -i.bak s/location=\"e/location=\"file:\\/\\/\\/Users\\/rajat.khandelwal\\/Git\\/incubator-lens\\/lens-dist\\/target\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/apache-lens-2.3.0-beta-incubating-SNAPSHOT-bin\\/client\\/e/g examples/resources/sales-aggr-fact2-mydb-parts.xml
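# the eight seds above hardcode one machine's absolute path; a hedged, equivalent loop
# form (BASE_DIR and the file list are the only assumptions, both taken from the commands above):
# BASE_DIR="$(pwd)"
# for f in product-local-parts city-local-part customer-local-part sales-raw-local-parts \
#          sales-aggr-fact1-local-parts sales-aggr-fact2-local-parts \
#          sales-aggr-fact1-mydb-parts sales-aggr-fact2-mydb-parts; do
#   sed -i.bak "s|location=\"e|location=\"file://$BASE_DIR/e|g" "examples/resources/$f.xml"
# done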
bin/lens-cli.sh
; This is a comment
; you can run shell commands here by beginning with !
; e.g.:
! ls
! ls examples
! ls examples/data
! ls examples/resources
; fresh db:
use newdb
; Let's discuss the basics in the process of registering and querying data
; Storage: a storage represents a physical storage. It can be a Hadoop file system or a database. It is defined by a name, an endpoint and the properties associated with it.
; let's create some storages:
! grep -A 33333333 <x_ examples/resources/local-storage.xml
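; (the grep -A 33333333 trick just prints a file from its <x_...> root element to the end, skipping whatever precedes it; we'll use it throughout to peek at each xml before creating the entity)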
create storage examples/resources/local-storage.xml
; Second storage:
! grep -A 33333333 <x_ examples/resources/local-cluster-storage.xml
create storage examples/resources/local-cluster-storage.xml
; Third storage:
! grep -A 33333333 <x_ examples/resources/db-storage.xml
create storage examples/resources/db-storage.xml
; See the three storages:
show storages
; Every entity type generally has a show command in the lens cli.
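; (in this demo we'll use show storages, show dimensions, show cubes, show dimtables and show facts)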
; Lens Entities
; Logical tables: collections of columns.
; A column can either be a dim attribute, a measure, or an expression
; Dimensions and Cubes are logical tables
; Dimensions: logical tables having only dim attributes and expressions over them
; Cubes: logical tables having dim attributes, measures and expressions over them. Expressions will mostly be over measures.
; Both Cubes and Dimensions can have a special kind of dim attribute called a reference dim attribute. It can reference another logical table's dim attribute.
; E.g. a cube will have country id as a direct dim attribute and country name as a ref dim attribute.
; Complex references: join chains. A join chain is a path from a source table to a destination column.
; Sometimes a direct relation is not possible
; Or
; Sometimes it's not unique: there are multiple paths to reach the same table
; It's logically possible to relate two physical tables through two different paths
; Then you can specify explicit paths between tables as join chains
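; e.g. in the sales cube below, customer_city, delivery_city and production_city are three different join chains from the same cube to the same city dimension, each via a different cube column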
; We'll create a sales cube and related dimensions.
; Dimensions first:
; City dimension:
! grep -A 33333333 <x_ examples/resources/city.xml
create dimension examples/resources/city.xml
; customer dimension:
; Entity names are case insensitive.
! grep -A 33333333 <x_ examples/resources/customer.xml
create dimension examples/resources/customer.xml
; Product dimension
! grep -A 33333333 <x_ examples/resources/product.xml
create dimension examples/resources/product.xml
; Listing all the dimensions we created:
show dimensions
; Queryable fields of dimensions
dimension show fields city
dimension show fields customer
; We can view join chains of dimensions:
dimension show joinchains customer
; Flattened view of the table shows direct fields as well as fields queryable via join chains
dimension show fields customer --flattened
; Example cubes:
; We'll create a sales cube.
; The cube xml file is pretty big; it contains dim attributes, measures, and join chains,
; and other important properties, e.g. relations between time dimensions. More on this later
! grep -A 33333333 <x_ examples/resources/sales-cube.xml
; as mentioned before, it has dim attributes, measures, expressions and join chains.
; some joinchain.fieldname fields can be made directly available in the cube.
; e.g. production_city_name in the above cube
create cube examples/resources/sales-cube.xml
show cubes
; you can see queryable fields of a cube
cube show fields sales
; Similarly, join chains:
cube show joinchains sales
; Here, customer_city, delivery_city, and production_city all begin at the sales cube and reach the city dimension
; But the paths are different: they use different columns of the cube. Hence join chains need to be defined
; Some fields have been made directly available, e.g. production_city_name.
; And finally, the flattened view
cube show fields sales --flattened
; Physical Tables
; Logical tables are about collections of columns, relations between them, etc.
; Physical tables are about the layout of the data:
; which storages it's stored on, and with what order of columns
; what the partition columns of the data are, etc.
; Cubes have facts, Dimensions have dimtables
; Facts can have a subset of the fields of a cube
; Facts belong to a cube.
; There is a similar relation between dimtables and Dimensions
; Cubes have time dimensions, facts have partition columns corresponding to time dimensions
! grep timed examples/resources/sales-cube.xml
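; (these cube properties map each time dimension to its partition column; presumably 'ot' for order_time and 'dt' for delivery_time, the part col names used near the end of the demo)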
; So the sales cube has three time dimensions
; Each has its corresponding partition column
; A fact of this cube might or might not be partitioned by these partition columns
; Time dimensions are the ones that should be used for querying
; partition columns are mentioned when registering data in facts or dimtables
; The translation is done by lens, as specified in the cube xml
; Lens sees which time dimensions are queried, and then preferably chooses the facts which
; are partitioned by the time dimension's partition column.
; The fallback strategy will be discussed later
; Dim tables are generally assumed to be snapshots
; So they will be available with only one partition column
; Though there is no restriction as of now.
; One dim table is one selection of columns of the dimension.
; The selection means the set of columns and their order
; The selection of columns can be stored on multiple storages
; But a different selection will be a separate dimtable.
; Create dim tables
; City has two different dim tables, one for each storage
! grep -A 33333333 <x_ examples/resources/city_table.xml
create dimtable examples/resources/city_table.xml
! grep -A 33333333 <x_ examples/resources/city_subset.xml
create dimtable examples/resources/city_subset.xml
; similarly product
; Now the following product table is partitioned by one time partition column and one non-time partition column
; this is a choice available to users of lens:
; data need not be partitioned only by time partition columns
! grep -A 33333333 <x_ examples/resources/product_table.xml
create dimtable examples/resources/product_table.xml
! grep -A 33333333 <x_ examples/resources/product_db_table.xml
create dimtable examples/resources/product_db_table.xml
; customer has only one dimtable, which is stored on both storages with the same schema
! grep -A 33333333 <x_ examples/resources/customer_table.xml
create dimtable examples/resources/customer_table.xml
show dimtables
; We can see dimtables of a particular dimension too:
show dimtables city
; create facts
; the raw fact is available only on local storage with hourly granularity, partitioned by all three part cols.
! grep -A 33333333 <x_ examples/resources/sales-raw-fact.xml
create fact examples/resources/sales-raw-fact.xml
; aggr fact 1 is available on both local and mydb with different granularities, both having three partition columns
! grep -A 33333333 <x_ examples/resources/sales-aggr-fact1.xml
create fact examples/resources/sales-aggr-fact1.xml
; aggr fact 2 on local has only one partition column; on mydb it has three. granularities are different
! grep -A 33333333 <x_ examples/resources/sales-aggr-fact2.xml
create fact examples/resources/sales-aggr-fact2.xml
show facts
; facts of one cube:
show facts sales
; All facts belong to the same cube
; Adding data
; Data is added as partitions in dimtables and facts
; there are two commands for adding partitions:
; you can add a single partition or multiple partitions
; this will become clear in the following commands
; dimtable partitions:
; the product local dimtable has two partition columns
; So each partition to be registered has values for both part cols:
! grep -A 33333333 <x_ examples/resources/product-local-parts.xml
dimtable add partitions --dimtable_name product_table --storage_name local --path examples/resources/product-local-parts.xml
; city local has only one part col, and we're registering a single partition, not a partition list
! grep -A 33333333 <x_ examples/resources/city-local-part.xml
dimtable add single-partition --dimtable_name city_table --storage_name local --path examples/resources/city-local-part.xml
; similar deal for the customer table.
! grep -A 33333333 <x_ examples/resources/customer-local-part.xml
dimtable add single-partition --dimtable_name customer_table --storage_name local --path examples/resources/customer-local-part.xml
; Notice that we only registered partitions for the local storage
; We're only demonstrating one storage here.
; fact partitions:
; two partitions for the raw fact, which has 3 time part cols:
! grep -A 33333333 <x_ examples/resources/sales-raw-local-parts.xml
fact add partitions --fact_name sales_raw_fact --storage_name local --path examples/resources/sales-raw-local-parts.xml
; three partitions for aggr fact1 on local storage
! grep -A 33333333 <x_ examples/resources/sales-aggr-fact1-local-parts.xml
fact add partitions --fact_name sales_aggr_fact1 --storage_name local --path examples/resources/sales-aggr-fact1-local-parts.xml
; fact 2 on local has one partition column. Partitions are registered for DAILY and HOURLY update periods:
! grep -A 33333333 <x_ examples/resources/sales-aggr-fact2-local-parts.xml
fact add partitions --fact_name sales_aggr_fact2 --storage_name local --path examples/resources/sales-aggr-fact2-local-parts.xml
; for db storage, registering partitions for aggr fact1 and fact2:
; fact1 has three time part cols on mydb. Registering a singleton list of partitions.
; It can be registered as a single-partition too, but the xml will be a little different:
; the x_partition_list wrapper tags will have to be removed.
; For now, let's register the singleton list:
! grep -A 33333333 <x_ examples/resources/sales-aggr-fact1-mydb-parts.xml
fact add partitions --fact_name sales_aggr_fact1 --storage_name mydb --path examples/resources/sales-aggr-fact1-mydb-parts.xml
; fact2 also has three time part cols on mydb
! grep -A 33333333 <x_ examples/resources/sales-aggr-fact2-mydb-parts.xml
fact add partitions --fact_name sales_aggr_fact2 --storage_name mydb --path examples/resources/sales-aggr-fact2-mydb-parts.xml
; I haven't shown you the actual data. The partitions only mention the location of the data,
; which is in examples/data
; If you want, you can look there yourself. (Homework :P)
; Now we can run queries on the sales cube and the three dimensions.
; While loading data, we associated it with partitions of tables.
; While querying, we can either use the given partition column
; or we can use the time dimension of the partition column
; If the time dimension maps to a partition column, Lens prefers a fact that has that partition column
query execute cube select product_id, store_sales from sales where time_range_in(order_time, '2015-04-11-00', '2015-04-13-00')
query execute cube select product_id, store_sales from sales where time_range_in(order_time, '2015-04-11-00', '2015-04-13-01')
query execute cube select product_id, store_sales from sales where time_range_in(order_time, '2015-04-12-00', '2015-04-13-00')
query execute cube select product_id, store_sales from sales where time_range_in(order_time, '2015-04-13-00', '2015-04-13-02')
query execute cube select product_id, store_sales from sales where time_range_in(delivery_time, '2015-04-11-00', '2015-04-13-00')
query execute cube select product_id, store_sales from sales where time_range_in(delivery_time, '2015-04-12-00', '2015-04-13-00')
query execute cube select promotion_sales, store_sales from sales where time_range_in(order_time, '2015-04-13-00', '2015-04-13-02')
query execute cube select promotion_sales, store_sales from sales where time_range_in(order_time, '2015-04-13-00', '2015-04-13-01')
query execute cube select customer_city_name, store_sales from sales where time_range_in(delivery_time, '2015-04-12-00', '2015-04-13-00')
query execute cube select customer_city_name, store_sales from sales where time_range_in(delivery_time, '2015-04-11-00', '2015-04-13-00')
query execute cube select customer_city_name, delivery_city.name, production_city.name, store_sales from sales where time_range_in(delivery_time, '2015-04-11-00', '2015-04-13-00')
; in the product table, category was a non-time partition column
; We can use that in queries too:
query execute cube select product_details.color, store_sales from sales where time_range_in(order_time, '2015-04-11-00', '2015-04-13-00') and product_details.category='Stationary'
query execute cube select product_details.category, store_sales from sales where time_range_in(order_time, '2015-04-11-00', '2015-04-13-01')
query execute cube select product_details.color, store_sales from sales where time_range_in(order_time, '2015-04-12-00', '2015-04-13-00') and product_details.category='Stationary'
query execute cube select product_details.category, store_sales from sales where time_range_in(order_time, '2015-04-11-00', '2015-04-13-01')
query execute cube select product_details.category, store_sales from sales where time_range_in(order_time, '2015-04-12-00', '2015-04-13-00')
query execute cube select product_details.color, store_sales from sales where time_range_in(delivery_time, '2015-04-11-00', '2015-04-13-00') and product_details.category='Stationary'
query execute cube select product_details.color, store_sales from sales where time_range_in(delivery_time, '2015-04-12-00', '2015-04-13-00') and product_details.category='Stationary'
; The following query sees that the ot part col is not present and falls back to querying on the dt part col
; If no fact has the partition column, something called the time dim relation is used
! grep relation examples/resources/sales-cube.xml
; It means that order time can be anywhere between delivery time - 20 days and delivery time - 1 hour
; So if a query comes on order time and no fact has order time partitions for the queried range,
; then the relation is used and the corresponding range for the related time dim is formed,
; and the partition column of the related time dim is picked for querying
query execute cube select customer_city_name, store_cost from sales where time_range_in(order_time, '2015-04-13-03', '2015-04-13-04')
; exit the cli
bye
# finish demo
exit