Skip to content

Instantly share code, notes, and snippets.

@ottomata
Created January 2, 2024 21:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ottomata/5557dd110a1aed4a535d942f001d21fb to your computer and use it in GitHub Desktop.
Save ottomata/5557dd110a1aed4a535d942f001d21fb to your computer and use it in GitHub Desktop.
EvolveHiveTable with comments output
spark3-submit --class org.wikimedia.analytics.refinery.job.refine.tool.EvolveHiveTable ./refinery-job/target/refinery-job-0.2.28-SNAPSHOT-shaded.jar --table=event.mediawiki_page_change_v1 --schema_uri=/mediawiki/page/change/latest --dry_run=true
24/01/02 21:49:53 INFO DataFrameToHive: Found difference in schemas for Hive table otto.mw_page_change0
Table schema:
root
-- _schema: string (nullable = true)
-- changelog_kind: string (nullable = true)
-- comment: string (nullable = true)
-- created_redirect_page: struct (nullable = true)
|-- is_redirect: boolean (nullable = true)
|-- namespace_id: long (nullable = true)
|-- page_id: long (nullable = true)
|-- page_title: string (nullable = true)
|-- revision_count: long (nullable = true)
-- dt: string (nullable = true)
-- meta: struct (nullable = true)
|-- domain: string (nullable = true)
|-- dt: string (nullable = true)
|-- id: string (nullable = true)
|-- request_id: string (nullable = true)
|-- stream: string (nullable = true)
|-- uri: string (nullable = true)
-- page: struct (nullable = true)
|-- is_redirect: boolean (nullable = true)
|-- namespace_id: long (nullable = true)
|-- page_id: long (nullable = true)
|-- page_title: string (nullable = true)
|-- revision_count: long (nullable = true)
|-- redirect_page_link: struct (nullable = true)
| |-- interwiki_prefix: string (nullable = true)
| |-- is_redirect: boolean (nullable = true)
| |-- namespace_id: long (nullable = true)
| |-- page_id: long (nullable = true)
| |-- page_title: string (nullable = true)
-- page_change_kind: string (nullable = true)
-- performer: struct (nullable = true)
|-- edit_count: long (nullable = true)
|-- groups: array (nullable = true)
| |-- element: string (containsNull = true)
|-- is_bot: boolean (nullable = true)
|-- is_system: boolean (nullable = true)
|-- is_temp: boolean (nullable = true)
|-- registration_dt: string (nullable = true)
|-- user_id: long (nullable = true)
|-- user_text: string (nullable = true)
-- prior_state: struct (nullable = true)
|-- page: struct (nullable = true)
| |-- is_redirect: boolean (nullable = true)
| |-- namespace_id: long (nullable = true)
| |-- page_id: long (nullable = true)
| |-- page_title: string (nullable = true)
| |-- revision_count: long (nullable = true)
|-- revision: struct (nullable = true)
| |-- comment: string (nullable = true)
| |-- content_slots: map (nullable = true)
| | |-- key: string
| | |-- value: struct (valueContainsNull = true)
| | | |-- content_body: string (nullable = true)
| | | |-- content_format: string (nullable = true)
| | | |-- content_model: string (nullable = true)
| | | |-- content_sha1: string (nullable = true)
| | | |-- content_size: long (nullable = true)
| | | |-- origin_rev_id: long (nullable = true)
| | | |-- slot_role: string (nullable = true)
| |-- editor: struct (nullable = true)
| | |-- edit_count: long (nullable = true)
| | |-- groups: array (nullable = true)
| | | |-- element: string (containsNull = true)
| | |-- is_bot: boolean (nullable = true)
| | |-- is_system: boolean (nullable = true)
| | |-- is_temp: boolean (nullable = true)
| | |-- registration_dt: string (nullable = true)
| | |-- user_id: long (nullable = true)
| | |-- user_text: string (nullable = true)
| |-- is_comment_visible: boolean (nullable = true)
| |-- is_content_visible: boolean (nullable = true)
| |-- is_editor_visible: boolean (nullable = true)
| |-- is_minor_edit: boolean (nullable = true)
| |-- rev_dt: string (nullable = true)
| |-- rev_id: long (nullable = true)
| |-- rev_parent_id: long (nullable = true)
| |-- rev_sha1: string (nullable = true)
| |-- rev_size: long (nullable = true)
-- revision: struct (nullable = true)
|-- comment: string (nullable = true)
|-- content_slots: map (nullable = true)
| |-- key: string
| |-- value: struct (valueContainsNull = true)
| | |-- content_body: string (nullable = true)
| | |-- content_format: string (nullable = true)
| | |-- content_model: string (nullable = true)
| | |-- content_sha1: string (nullable = true)
| | |-- content_size: long (nullable = true)
| | |-- origin_rev_id: long (nullable = true)
| | |-- slot_role: string (nullable = true)
|-- editor: struct (nullable = true)
| |-- edit_count: long (nullable = true)
| |-- groups: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- is_bot: boolean (nullable = true)
| |-- is_system: boolean (nullable = true)
| |-- is_temp: boolean (nullable = true)
| |-- registration_dt: string (nullable = true)
| |-- user_id: long (nullable = true)
| |-- user_text: string (nullable = true)
|-- is_comment_visible: boolean (nullable = true)
|-- is_content_visible: boolean (nullable = true)
|-- is_editor_visible: boolean (nullable = true)
|-- is_minor_edit: boolean (nullable = true)
|-- rev_dt: string (nullable = true)
|-- rev_id: long (nullable = true)
|-- rev_parent_id: long (nullable = true)
|-- rev_sha1: string (nullable = true)
|-- rev_size: long (nullable = true)
-- wiki_id: string (nullable = true)
-- is_wmf_domain: boolean (nullable = true)
-- normalized_host: struct (nullable = true)
|-- project_class: string (nullable = true)
|-- project: string (nullable = true)
|-- qualifiers: array (nullable = true)
| |-- element: string (containsNull = true)
|-- tld: string (nullable = true)
|-- project_family: string (nullable = true)
-- datacenter: string (nullable = true)
-- year: long (nullable = true)
-- month: long (nullable = true)
-- day: long (nullable = true)
-- hour: long (nullable = true)
Input schema:
root
-- $schema: string (nullable = false)
-- changelog_kind: string (nullable = false)
-- comment: string (nullable = true)
-- created_redirect_page: struct (nullable = true)
|-- is_redirect: boolean (nullable = true)
|-- namespace_id: long (nullable = true)
|-- page_id: long (nullable = false)
|-- page_title: string (nullable = false)
|-- revision_count: long (nullable = true)
-- dt: string (nullable = false)
-- meta: struct (nullable = false)
|-- domain: string (nullable = true)
|-- dt: string (nullable = true)
|-- id: string (nullable = true)
|-- request_id: string (nullable = true)
|-- stream: string (nullable = false)
|-- uri: string (nullable = true)
-- page: struct (nullable = false)
|-- is_redirect: boolean (nullable = true)
|-- namespace_id: long (nullable = true)
|-- page_id: long (nullable = false)
|-- page_title: string (nullable = false)
|-- redirect_page_link: struct (nullable = true)
| |-- interwiki_prefix: string (nullable = true)
| |-- is_redirect: boolean (nullable = true)
| |-- namespace_id: long (nullable = true)
| |-- page_id: long (nullable = true)
| |-- page_title: string (nullable = true)
|-- revision_count: long (nullable = true)
-- page_change_kind: string (nullable = false)
-- performer: struct (nullable = false)
|-- edit_count: long (nullable = true)
|-- groups: array (nullable = true)
| |-- element: string (containsNull = true)
|-- is_bot: boolean (nullable = true)
|-- is_system: boolean (nullable = true)
|-- is_temp: boolean (nullable = true)
|-- registration_dt: string (nullable = true)
|-- user_id: long (nullable = true)
|-- user_text: string (nullable = true)
-- prior_state: struct (nullable = true)
|-- page: struct (nullable = true)
| |-- is_redirect: boolean (nullable = true)
| |-- namespace_id: long (nullable = true)
| |-- page_id: long (nullable = true)
| |-- page_title: string (nullable = true)
| |-- revision_count: long (nullable = true)
|-- revision: struct (nullable = true)
| |-- comment: string (nullable = true)
| |-- content_slots: map (nullable = true)
| | |-- key: string
| | |-- value: struct (valueContainsNull = true)
| | | |-- content_body: string (nullable = true)
| | | |-- content_format: string (nullable = true)
| | | |-- content_model: string (nullable = true)
| | | |-- content_sha1: string (nullable = true)
| | | |-- content_size: long (nullable = true)
| | | |-- origin_rev_id: long (nullable = true)
| | | |-- slot_role: string (nullable = true)
| |-- editor: struct (nullable = true)
| | |-- edit_count: long (nullable = true)
| | |-- groups: array (nullable = true)
| | | |-- element: string (containsNull = true)
| | |-- is_bot: boolean (nullable = true)
| | |-- is_system: boolean (nullable = true)
| | |-- is_temp: boolean (nullable = true)
| | |-- registration_dt: string (nullable = true)
| | |-- user_id: long (nullable = true)
| | |-- user_text: string (nullable = true)
| |-- is_comment_visible: boolean (nullable = true)
| |-- is_content_visible: boolean (nullable = true)
| |-- is_editor_visible: boolean (nullable = true)
| |-- is_minor_edit: boolean (nullable = true)
| |-- rev_dt: string (nullable = true)
| |-- rev_id: long (nullable = true)
| |-- rev_parent_id: long (nullable = true)
| |-- rev_sha1: string (nullable = true)
| |-- rev_size: long (nullable = true)
-- revision: struct (nullable = false)
|-- comment: string (nullable = true)
|-- content_slots: map (nullable = true)
| |-- key: string
| |-- value: struct (valueContainsNull = true)
| | |-- content_body: string (nullable = true)
| | |-- content_format: string (nullable = true)
| | |-- content_model: string (nullable = true)
| | |-- content_sha1: string (nullable = true)
| | |-- content_size: long (nullable = true)
| | |-- origin_rev_id: long (nullable = true)
| | |-- slot_role: string (nullable = true)
|-- editor: struct (nullable = true)
| |-- edit_count: long (nullable = true)
| |-- groups: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- is_bot: boolean (nullable = true)
| |-- is_system: boolean (nullable = true)
| |-- is_temp: boolean (nullable = true)
| |-- registration_dt: string (nullable = true)
| |-- user_id: long (nullable = true)
| |-- user_text: string (nullable = true)
|-- is_comment_visible: boolean (nullable = true)
|-- is_content_visible: boolean (nullable = true)
|-- is_editor_visible: boolean (nullable = true)
|-- is_minor_edit: boolean (nullable = true)
|-- rev_dt: string (nullable = false)
|-- rev_id: long (nullable = false)
|-- rev_parent_id: long (nullable = true)
|-- rev_sha1: string (nullable = true)
|-- rev_size: long (nullable = true)
-- wiki_id: string (nullable = false)
Alter statements:
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `_schema` `_schema` STRING COMMENT 'A URI identifying the JSONSchema for this event. This should match an schema\'s $id in a schema repository. E.g. /schema/title/1.0.0
'
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `changelog_kind` `changelog_kind` STRING COMMENT 'The kind of this event in a changelog. This is used to map the event to an action in a data store.
'
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `comment` `comment` STRING COMMENT 'The comment left by the user that performed this change. Same as revision.comment on edits.
'
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `created_redirect_page` `created_redirect_page` STRUCT<`is_redirect`: BOOLEAN COMMENT 'True if the page is a redirect page at the time of this event.', `namespace_id`: BIGINT COMMENT 'The id of the namespace this page belongs to.', `page_id`: BIGINT COMMENT 'The (database) page ID of the page.', `page_title`: STRING COMMENT 'The normalized title of the page.', `revision_count`: BIGINT COMMENT 'NOTE: revision_count is never set for created_redirect_page. It is present here for backwards compatibility only.
'> COMMENT 'Page entity that was created at the old title during a page move. This is only set for page move events. Note that the created_redirect_page will also have its own associated page create event.
'
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `dt` `dt` STRING COMMENT 'ISO-8601 formatted timestamp of when the event occurred/was generated in UTC), AKA \'event time\'. This is different than meta.dt, which is used as the time the system received this event.
'
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `meta` `meta` STRUCT<`domain`: STRING COMMENT 'Domain the event or entity pertains to', `dt`: STRING COMMENT 'Time the event was received by the system, in UTC ISO-8601 format', `id`: STRING COMMENT 'Unique ID of this event', `request_id`: STRING COMMENT 'Unique ID of the request that caused the event', `stream`: STRING COMMENT 'Name of the stream (dataset) that this event belongs in', `uri`: STRING COMMENT 'Unique URI identifying the event or entity'>
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `page` `page` STRUCT<`is_redirect`: BOOLEAN COMMENT 'True if the page is a redirect page at the time of this event.', `namespace_id`: BIGINT COMMENT 'The id of the namespace this page belongs to.', `page_id`: BIGINT COMMENT 'The (database) page ID of the page.', `page_title`: STRING COMMENT 'The normalized title of the page.', `revision_count`: BIGINT COMMENT 'The number of revisions of this page at the time of this event. During a delete, this number of revisions will be archived. This field is likely only set for page delete events, as getting this information on all events is expensive.
', `redirect_page_link`: STRUCT<`interwiki_prefix`: STRING COMMENT 'The interwiki prefix (iw_prefix) of this link. The presence of this prefix implies a target outside the local wiki. See https://meta.wikimedia.org/wiki/Help:Interwiki_linking
', `is_redirect`: BOOLEAN COMMENT 'True if the page is a redirect page at the time of this event.', `namespace_id`: BIGINT COMMENT 'The id of the namespace this page belongs to.', `page_id`: BIGINT COMMENT 'The (database) page ID of the page.', `page_title`: STRING COMMENT 'The normalized title of the page.'> COMMENT 'If this page is currently a redirect, then this field contains information about the target page the redirect links to.
'> COMMENT 'Fields for MediaWiki page entity.'
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `page_change_kind` `page_change_kind` STRING COMMENT 'The origin kind of the change to this page as viewed by MediaWiki.
'
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `performer` `performer` STRUCT<`edit_count`: BIGINT COMMENT 'The number of edits this user has made at the time of this event. Not present for anonymous users.
', `groups`: ARRAY<STRING>, `is_bot`: BOOLEAN COMMENT 'True if this user is considered to be a bot at the time of this event. This is checked via the $user->isBot() method, which considers both user_groups and user permissions.
', `is_system`: BOOLEAN COMMENT 'True if the user is a MediaWiki \'system\' user. These are users that cannot \'authenticate\'. These are usually listed in ReservedUsernames.
', `is_temp`: BOOLEAN COMMENT 'True if the user is an autocreated temporary MediaWiki user. This is used for IP masking.
', `registration_dt`: STRING COMMENT 'The datetime of the user account registration. Not present for anonymous users or if missing in the MW database.
', `user_id`: BIGINT COMMENT 'The user ID that performed this change. This is optional, and will not be present for anonymous users.
', `user_text`: STRING COMMENT 'The user name or text representation of the user that performed this change.
'> COMMENT 'Represents the MediaWiki actor that made this change. If this change is an edit, this will be the same as revision.editor.
'
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `prior_state` `prior_state` STRUCT<`page`: STRUCT<`is_redirect`: BOOLEAN COMMENT 'True if the page is a redirect page at the time of this event.', `namespace_id`: BIGINT COMMENT 'The id of the namespace this page belongs to.', `page_id`: BIGINT COMMENT 'The (database) page ID of the page.', `page_title`: STRING COMMENT 'The normalized title of the page.', `revision_count`: BIGINT COMMENT 'NOTE: prior_state.page.revision_count is unlikely to be set, as getting the # of revisions previous to this change is difficult. This field is present here for backwards compatibiliy.
'> COMMENT 'Fields for MediaWiki page entity.', `revision`: STRUCT<`comment`: STRING COMMENT 'The comment left by the editor when this revision was made.', `content_slots`: MAP<STRING, STRUCT<`content_body`: STRING COMMENT 'Content body. NOTE: This field is not required, and is often not set in streams as it can make events very large. It is included here for events that do include the content body.
', `content_format`: STRING COMMENT 'The \'content type\' of the content. E.g. wikitext/html. This is similiar to a MIME type.', `content_model`: STRING COMMENT 'MediaWiki\'s content model of this content. E.g. wikitext, json, etc.', `content_sha1`: STRING COMMENT 'sha1 sum of the content body.', `content_size`: BIGINT COMMENT 'Byte size of the content body.', `origin_rev_id`: BIGINT COMMENT 'Revision in which this slot was originally created', `slot_role`: STRING COMMENT 'Slot role name.'>>, `editor`: STRUCT<`edit_count`: BIGINT COMMENT 'The number of edits this user has made at the time of this event. Not present for anonymous users.
', `groups`: ARRAY<STRING>, `is_bot`: BOOLEAN COMMENT 'True if this user is considered to be a bot at the time of this event. This is checked via the $user->isBot() method, which considers both user_groups and user permissions.
', `is_system`: BOOLEAN COMMENT 'True if the user is a MediaWiki \'system\' user. These are users that cannot \'authenticate\'. These are usually listed in ReservedUsernames.
', `is_temp`: BOOLEAN COMMENT 'True if the user is an autocreated temporary MediaWiki user. This is used for IP masking.
', `registration_dt`: STRING COMMENT 'The datetime of the user account registration. Not present for anonymous users or if missing in the MW database.
', `user_id`: BIGINT COMMENT 'The user ID that performed this change. This is optional, and will not be present for anonymous users.
', `user_text`: STRING COMMENT 'The user name or text representation of the user that performed this change.
'> COMMENT 'Represents the MediaWiki user that made this edit.', `is_comment_visible`: BOOLEAN COMMENT 'Whether the comment of the revision is visible. See RevisionRecord->DELETED_COMMENT.
', `is_content_visible`: BOOLEAN COMMENT 'Whether the revision\'s content body is visible. If this is false, then content should be redacted. See RevisionRecord->DELETED_TEXT
', `is_editor_visible`: BOOLEAN COMMENT 'Whether the revision\'s editor information is visible. Affects editor field. See RevisionRecord->DELETED_USER
', `is_minor_edit`: BOOLEAN COMMENT 'True if the editor marked this revision as a minor edit.', `rev_dt`: STRING COMMENT 'Time this revision was created. This is rev_timestamp in the MediaWiki database.
', `rev_id`: BIGINT COMMENT 'The (database) revision ID.', `rev_parent_id`: BIGINT COMMENT 'This revision\'s parent rev_id.', `rev_sha1`: STRING COMMENT 'sha1 sum considering all the content slots for this revision.
', `rev_size`: BIGINT COMMENT 'Byte size \'sum\' of all the content slots for this revision. This \'size\' is approximate, but may not be exact, depending on the kind of data that is stored in the content slots.
'> COMMENT 'Fields for MediaWiki revision entity.'> COMMENT 'Prior state of this page before this event. Fields are only present if their values have changed.
'
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `revision` `revision` STRUCT<`comment`: STRING COMMENT 'The comment left by the editor when this revision was made.', `content_slots`: MAP<STRING, STRUCT<`content_body`: STRING COMMENT 'Content body. NOTE: This field is not required, and is often not set in streams as it can make events very large. It is included here for events that do include the content body.
', `content_format`: STRING COMMENT 'The \'content type\' of the content. E.g. wikitext/html. This is similiar to a MIME type.', `content_model`: STRING COMMENT 'MediaWiki\'s content model of this content. E.g. wikitext, json, etc.', `content_sha1`: STRING COMMENT 'sha1 sum of the content body.', `content_size`: BIGINT COMMENT 'Byte size of the content body.', `origin_rev_id`: BIGINT COMMENT 'Revision in which this slot was originally created', `slot_role`: STRING COMMENT 'Slot role name.'>>, `editor`: STRUCT<`edit_count`: BIGINT COMMENT 'The number of edits this user has made at the time of this event. Not present for anonymous users.
', `groups`: ARRAY<STRING>, `is_bot`: BOOLEAN COMMENT 'True if this user is considered to be a bot at the time of this event. This is checked via the $user->isBot() method, which considers both user_groups and user permissions.
', `is_system`: BOOLEAN COMMENT 'True if the user is a MediaWiki \'system\' user. These are users that cannot \'authenticate\'. These are usually listed in ReservedUsernames.
', `is_temp`: BOOLEAN COMMENT 'True if the user is an autocreated temporary MediaWiki user. This is used for IP masking.
', `registration_dt`: STRING COMMENT 'The datetime of the user account registration. Not present for anonymous users or if missing in the MW database.
', `user_id`: BIGINT COMMENT 'The user ID that performed this change. This is optional, and will not be present for anonymous users.
', `user_text`: STRING COMMENT 'The user name or text representation of the user that performed this change.
'> COMMENT 'Represents the MediaWiki user that made this edit.', `is_comment_visible`: BOOLEAN COMMENT 'Whether the comment of the revision is visible. See RevisionRecord->DELETED_COMMENT.
', `is_content_visible`: BOOLEAN COMMENT 'Whether the revision\'s content body is visible. If this is false, then content should be redacted. See RevisionRecord->DELETED_TEXT
', `is_editor_visible`: BOOLEAN COMMENT 'Whether the revision\'s editor information is visible. Affects editor field. See RevisionRecord->DELETED_USER
', `is_minor_edit`: BOOLEAN COMMENT 'True if the editor marked this revision as a minor edit.', `rev_dt`: STRING COMMENT 'Time this revision was created. This is rev_timestamp in the MediaWiki database.
', `rev_id`: BIGINT COMMENT 'The (database) revision ID.', `rev_parent_id`: BIGINT COMMENT 'This revision\'s parent rev_id.', `rev_sha1`: STRING COMMENT 'sha1 sum considering all the content slots for this revision.
', `rev_size`: BIGINT COMMENT 'Byte size \'sum\' of all the content slots for this revision. This \'size\' is approximate, but may not be exact, depending on the kind of data that is stored in the content slots.
'> COMMENT 'Fields for MediaWiki revision entity.'
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `wiki_id` `wiki_id` STRING COMMENT 'The wiki ID, which is usually the same as the MediaWiki database name. E.g. enwiki, metawiki, etc.
'
24/01/02 21:49:53 INFO DataFrameToHive: Connecting to Hive over JDBC at jdbc:hive2://analytics-hive.eqiad.wmnet:10000/default;principal=hive/analytics-hive.eqiad.wmnet@WIKIMEDIA
24/01/02 21:49:53 INFO DataFrameToHive: Running Hive DDL statement:
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `_schema` `_schema` STRING COMMENT 'A URI identifying the JSONSchema for this event. This should match an schema\'s $id in a schema repository. E.g. /schema/title/1.0.0
'
24/01/02 21:49:53 INFO DataFrameToHive: Running Hive DDL statement:
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `changelog_kind` `changelog_kind` STRING COMMENT 'The kind of this event in a changelog. This is used to map the event to an action in a data store.
'
24/01/02 21:49:54 INFO DataFrameToHive: Running Hive DDL statement:
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `comment` `comment` STRING COMMENT 'The comment left by the user that performed this change. Same as revision.comment on edits.
'
24/01/02 21:49:54 INFO DataFrameToHive: Running Hive DDL statement:
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `created_redirect_page` `created_redirect_page` STRUCT<`is_redirect`: BOOLEAN COMMENT 'True if the page is a redirect page at the time of this event.', `namespace_id`: BIGINT COMMENT 'The id of the namespace this page belongs to.', `page_id`: BIGINT COMMENT 'The (database) page ID of the page.', `page_title`: STRING COMMENT 'The normalized title of the page.', `revision_count`: BIGINT COMMENT 'NOTE: revision_count is never set for created_redirect_page. It is present here for backwards compatibility only.
'> COMMENT 'Page entity that was created at the old title during a page move. This is only set for page move events. Note that the created_redirect_page will also have its own associated page create event.
'
24/01/02 21:49:54 INFO DataFrameToHive: Running Hive DDL statement:
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `dt` `dt` STRING COMMENT 'ISO-8601 formatted timestamp of when the event occurred/was generated in UTC), AKA \'event time\'. This is different than meta.dt, which is used as the time the system received this event.
'
24/01/02 21:49:54 INFO DataFrameToHive: Running Hive DDL statement:
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `meta` `meta` STRUCT<`domain`: STRING COMMENT 'Domain the event or entity pertains to', `dt`: STRING COMMENT 'Time the event was received by the system, in UTC ISO-8601 format', `id`: STRING COMMENT 'Unique ID of this event', `request_id`: STRING COMMENT 'Unique ID of the request that caused the event', `stream`: STRING COMMENT 'Name of the stream (dataset) that this event belongs in', `uri`: STRING COMMENT 'Unique URI identifying the event or entity'>
24/01/02 21:49:54 INFO DataFrameToHive: Running Hive DDL statement:
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `page` `page` STRUCT<`is_redirect`: BOOLEAN COMMENT 'True if the page is a redirect page at the time of this event.', `namespace_id`: BIGINT COMMENT 'The id of the namespace this page belongs to.', `page_id`: BIGINT COMMENT 'The (database) page ID of the page.', `page_title`: STRING COMMENT 'The normalized title of the page.', `revision_count`: BIGINT COMMENT 'The number of revisions of this page at the time of this event. During a delete, this number of revisions will be archived. This field is likely only set for page delete events, as getting this information on all events is expensive.
', `redirect_page_link`: STRUCT<`interwiki_prefix`: STRING COMMENT 'The interwiki prefix (iw_prefix) of this link. The presence of this prefix implies a target outside the local wiki. See https://meta.wikimedia.org/wiki/Help:Interwiki_linking
', `is_redirect`: BOOLEAN COMMENT 'True if the page is a redirect page at the time of this event.', `namespace_id`: BIGINT COMMENT 'The id of the namespace this page belongs to.', `page_id`: BIGINT COMMENT 'The (database) page ID of the page.', `page_title`: STRING COMMENT 'The normalized title of the page.'> COMMENT 'If this page is currently a redirect, then this field contains information about the target page the redirect links to.
'> COMMENT 'Fields for MediaWiki page entity.'
24/01/02 21:49:54 INFO DataFrameToHive: Running Hive DDL statement:
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `page_change_kind` `page_change_kind` STRING COMMENT 'The origin kind of the change to this page as viewed by MediaWiki.
'
24/01/02 21:49:54 INFO DataFrameToHive: Running Hive DDL statement:
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `performer` `performer` STRUCT<`edit_count`: BIGINT COMMENT 'The number of edits this user has made at the time of this event. Not present for anonymous users.
', `groups`: ARRAY<STRING>, `is_bot`: BOOLEAN COMMENT 'True if this user is considered to be a bot at the time of this event. This is checked via the $user->isBot() method, which considers both user_groups and user permissions.
', `is_system`: BOOLEAN COMMENT 'True if the user is a MediaWiki \'system\' user. These are users that cannot \'authenticate\'. These are usually listed in ReservedUsernames.
', `is_temp`: BOOLEAN COMMENT 'True if the user is an autocreated temporary MediaWiki user. This is used for IP masking.
', `registration_dt`: STRING COMMENT 'The datetime of the user account registration. Not present for anonymous users or if missing in the MW database.
', `user_id`: BIGINT COMMENT 'The user ID that performed this change. This is optional, and will not be present for anonymous users.
', `user_text`: STRING COMMENT 'The user name or text representation of the user that performed this change.
'> COMMENT 'Represents the MediaWiki actor that made this change. If this change is an edit, this will be the same as revision.editor.
'
24/01/02 21:49:54 INFO DataFrameToHive: Running Hive DDL statement:
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `prior_state` `prior_state` STRUCT<`page`: STRUCT<`is_redirect`: BOOLEAN COMMENT 'True if the page is a redirect page at the time of this event.', `namespace_id`: BIGINT COMMENT 'The id of the namespace this page belongs to.', `page_id`: BIGINT COMMENT 'The (database) page ID of the page.', `page_title`: STRING COMMENT 'The normalized title of the page.', `revision_count`: BIGINT COMMENT 'NOTE: prior_state.page.revision_count is unlikely to be set, as getting the # of revisions previous to this change is difficult. This field is present here for backwards compatibiliy.
'> COMMENT 'Fields for MediaWiki page entity.', `revision`: STRUCT<`comment`: STRING COMMENT 'The comment left by the editor when this revision was made.', `content_slots`: MAP<STRING, STRUCT<`content_body`: STRING COMMENT 'Content body. NOTE: This field is not required, and is often not set in streams as it can make events very large. It is included here for events that do include the content body.
', `content_format`: STRING COMMENT 'The \'content type\' of the content. E.g. wikitext/html. This is similiar to a MIME type.', `content_model`: STRING COMMENT 'MediaWiki\'s content model of this content. E.g. wikitext, json, etc.', `content_sha1`: STRING COMMENT 'sha1 sum of the content body.', `content_size`: BIGINT COMMENT 'Byte size of the content body.', `origin_rev_id`: BIGINT COMMENT 'Revision in which this slot was originally created', `slot_role`: STRING COMMENT 'Slot role name.'>>, `editor`: STRUCT<`edit_count`: BIGINT COMMENT 'The number of edits this user has made at the time of this event. Not present for anonymous users.
', `groups`: ARRAY<STRING>, `is_bot`: BOOLEAN COMMENT 'True if this user is considered to be a bot at the time of this event. This is checked via the $user->isBot() method, which considers both user_groups and user permissions.
', `is_system`: BOOLEAN COMMENT 'True if the user is a MediaWiki \'system\' user. These are users that cannot \'authenticate\'. These are usually listed in ReservedUsernames.
', `is_temp`: BOOLEAN COMMENT 'True if the user is an autocreated temporary MediaWiki user. This is used for IP masking.
', `registration_dt`: STRING COMMENT 'The datetime of the user account registration. Not present for anonymous users or if missing in the MW database.
', `user_id`: BIGINT COMMENT 'The user ID that performed this change. This is optional, and will not be present for anonymous users.
', `user_text`: STRING COMMENT 'The user name or text representation of the user that performed this change.
'> COMMENT 'Represents the MediaWiki user that made this edit.', `is_comment_visible`: BOOLEAN COMMENT 'Whether the comment of the revision is visible. See RevisionRecord->DELETED_COMMENT.
', `is_content_visible`: BOOLEAN COMMENT 'Whether the revision\'s content body is visible. If this is false, then content should be redacted. See RevisionRecord->DELETED_TEXT
', `is_editor_visible`: BOOLEAN COMMENT 'Whether the revision\'s editor information is visible. Affects editor field. See RevisionRecord->DELETED_USER
', `is_minor_edit`: BOOLEAN COMMENT 'True if the editor marked this revision as a minor edit.', `rev_dt`: STRING COMMENT 'Time this revision was created. This is rev_timestamp in the MediaWiki database.
', `rev_id`: BIGINT COMMENT 'The (database) revision ID.', `rev_parent_id`: BIGINT COMMENT 'This revision\'s parent rev_id.', `rev_sha1`: STRING COMMENT 'sha1 sum considering all the content slots for this revision.
', `rev_size`: BIGINT COMMENT 'Byte size \'sum\' of all the content slots for this revision. This \'size\' is approximate, but may not be exact, depending on the kind of data that is stored in the content slots.
'> COMMENT 'Fields for MediaWiki revision entity.'> COMMENT 'Prior state of this page before this event. Fields are only present if their values have changed.
'
24/01/02 21:49:55 INFO DataFrameToHive: Running Hive DDL statement:
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `revision` `revision` STRUCT<`comment`: STRING COMMENT 'The comment left by the editor when this revision was made.', `content_slots`: MAP<STRING, STRUCT<`content_body`: STRING COMMENT 'Content body. NOTE: This field is not required, and is often not set in streams as it can make events very large. It is included here for events that do include the content body.
', `content_format`: STRING COMMENT 'The \'content type\' of the content. E.g. wikitext/html. This is similar to a MIME type.', `content_model`: STRING COMMENT 'MediaWiki\'s content model of this content. E.g. wikitext, json, etc.', `content_sha1`: STRING COMMENT 'sha1 sum of the content body.', `content_size`: BIGINT COMMENT 'Byte size of the content body.', `origin_rev_id`: BIGINT COMMENT 'Revision in which this slot was originally created', `slot_role`: STRING COMMENT 'Slot role name.'>>, `editor`: STRUCT<`edit_count`: BIGINT COMMENT 'The number of edits this user has made at the time of this event. Not present for anonymous users.
', `groups`: ARRAY<STRING>, `is_bot`: BOOLEAN COMMENT 'True if this user is considered to be a bot at the time of this event. This is checked via the $user->isBot() method, which considers both user_groups and user permissions.
', `is_system`: BOOLEAN COMMENT 'True if the user is a MediaWiki \'system\' user. These are users that cannot \'authenticate\'. These are usually listed in ReservedUsernames.
', `is_temp`: BOOLEAN COMMENT 'True if the user is an autocreated temporary MediaWiki user. This is used for IP masking.
', `registration_dt`: STRING COMMENT 'The datetime of the user account registration. Not present for anonymous users or if missing in the MW database.
', `user_id`: BIGINT COMMENT 'The user ID that performed this change. This is optional, and will not be present for anonymous users.
', `user_text`: STRING COMMENT 'The user name or text representation of the user that performed this change.
'> COMMENT 'Represents the MediaWiki user that made this edit.', `is_comment_visible`: BOOLEAN COMMENT 'Whether the comment of the revision is visible. See RevisionRecord->DELETED_COMMENT.
', `is_content_visible`: BOOLEAN COMMENT 'Whether the revision\'s content body is visible. If this is false, then content should be redacted. See RevisionRecord->DELETED_TEXT
', `is_editor_visible`: BOOLEAN COMMENT 'Whether the revision\'s editor information is visible. Affects editor field. See RevisionRecord->DELETED_USER
', `is_minor_edit`: BOOLEAN COMMENT 'True if the editor marked this revision as a minor edit.', `rev_dt`: STRING COMMENT 'Time this revision was created. This is rev_timestamp in the MediaWiki database.
', `rev_id`: BIGINT COMMENT 'The (database) revision ID.', `rev_parent_id`: BIGINT COMMENT 'This revision\'s parent rev_id.', `rev_sha1`: STRING COMMENT 'sha1 sum considering all the content slots for this revision.
', `rev_size`: BIGINT COMMENT 'Byte size \'sum\' of all the content slots for this revision. This \'size\' is approximate, but may not be exact, depending on the kind of data that is stored in the content slots.
'> COMMENT 'Fields for MediaWiki revision entity.'
24/01/02 21:49:55 INFO DataFrameToHive: Running Hive DDL statement:
ALTER TABLE otto.mw_page_change0
CHANGE COLUMN `wiki_id` `wiki_id` STRING COMMENT 'The wiki ID, which is usually the same as the MediaWiki database name. E.g. enwiki, metawiki, etc.
'
24/01/02 21:49:55 INFO EvolveHiveTable: Altered Hive table otto.mw_page_change0.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment