Skip to content

Instantly share code, notes, and snippets.

@milimetric
Last active September 27, 2016 16:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save milimetric/301d7fec9a3234029c0f316e166301f5 to your computer and use it in GitHub Desktop.
Save milimetric/301d7fec9a3234029c0f316e166301f5 to your computer and use it in GitHub Desktop.
Queries to get simple metrics from mediawiki_history
select substring(event_timestamp, 0, 8) day,
count(*) `All namespaces`,
sum(if( page_namespace_latest = 0
,1, 0)) `Namespace Zero`,
sum(if( page_namespace_latest = 0
and revision_deleted_timestamp is null
,1, 0)) `Namespace Zero not Deleted`
from milimetric.mediawiki_history
where event_entity = 'revision'
and event_type = 'create'
group by substring(event_timestamp, 0, 8)
order by day
limit 10000000
;
select substring(event_timestamp, 0, 8) day,
count(*) `All namespaces`,
sum(if( page_namespace_latest = 0
,1, 0)) `Namespace Zero`,
sum(if( page_namespace_latest = 0
and revision_deleted_timestamp is null
,1, 0)) `Namespace Zero not Deleted`
from milimetric.mediawiki_history
where event_entity = 'page'
and event_type = 'create'
group by substring(event_timestamp, 0, 8)
order by day
limit 10000000
;
CREATE EXTERNAL TABLE `mediawiki_history`(
wiki_db string,
event_entity string,
event_type string,
event_timestamp string,
event_comment string,
event_user_id bigint,
event_user_text string,
event_user_text_latest string,
event_user_blocks array<string>,
event_user_blocks_latest array<string>,
event_user_groups array<string>,
event_user_groups_latest array<string>,
event_user_creation_timestamp string,
page_id bigint,
page_title string,
page_title_latest string,
page_namespace int,
page_namespace_latest int,
page_creation_timestamp string,
user_id bigint,
user_text string,
user_text_latest string,
user_blocks array<string>,
user_blocks_latest array<string>,
user_groups array<string>,
user_groups_latest array<string>,
user_creation_timestamp string,
revision_id bigint,
revision_parent_id bigint,
revision_minor_edit boolean,
revision_text_bytes bigint,
revision_text_bytes_diff bigint,
revision_text_sha1 string,
revision_content_model string,
revision_content_format string,
revision_is_deleted boolean,
revision_deleted_timestamp string,
revision_is_identity_reverted boolean,
revision_most_recent_identity_revert_timestamp string,
revision_is_identity_revert boolean
)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
'hdfs://analytics-hadoop/user/joal/mwhist_3/denorm_2'
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment