Last active
September 23, 2020 02:14
-
-
Save stIncMale/1db6608396edb35a1cc19894f430b5a2 to your computer and use it in GitHub Desktop.
The largest SQL I have written so far (PostgreSQL syntax)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
with
new_event_tree_model_lob as (--the format of event_tree.model_lob is specified below
    select
        event_tree_id,
        string_agg(
            array_to_string(
                model_attr_array[:1] ||--a slice of the array with indices <=1, we need an array type here instead of an element of the array
                check_not_null(--if there is no matching file then model_lob refers to a file that does not exist, which means the data is corrupted
                    files.new_id,
                    array['files for event_tree', event_tree_id::text, 'model_idx', model_idx::text] || 'model_attr_array'::text || model_attr_array)::text ||
                model_attr_array[3] ||
                check_not_null(
                    stream_function.new_id,
                    array['stream_function for event_tree', event_tree_id::text, 'model_idx', model_idx::text] || 'model_attr_array'::text || model_attr_array)::text ||
                check_not_null(
                    function_metrics.new_id,
                    array['function_metrics for event_tree', event_tree_id::text, 'model_idx', model_idx::text] || 'model_attr_array'::text || model_attr_array)::text,
                ','
            ),
            ';' order by model_idx
        ) as new_model_lob
    from
        (select id as event_tree_id, model_idx, string_to_array(model, ',') as model_attr_array
         from
             (values (1001, '0,111,1,113,114;0,121,1,123,124'), (1002, '0,211,1,213,214')) as event_tree (id, model_lob)
             cross join unnest(string_to_array(model_lob, ';')) with ordinality as model (model, model_idx)--a table containing each model from model_lob in its own row
        ) as parsed_model_lob--a table containing an array of attributes representing each model from model_lob in its own row
        --we use left join even though we must always find a corresponding row in the right-side table, so that check_not_null can verify this requirement
        left join (values (111, -111), (121, -121), (211, -211)) as files (id, new_id) on model_attr_array[2]::bigint = files.id
        left join (values (113, -113), (123, -123), (213, -213)) as stream_function (id, new_id) on model_attr_array[4]::bigint = stream_function.id
        left join (values (114, -114), (124, -124), (214, -214)) as function_metrics (id, new_id) on model_attr_array[5]::bigint = function_metrics.id
    group by event_tree_id
),
new_event_tree_event_lob as (--the format of event_tree.event_lob is specified below
    with
    parsed_event_lob_parsed_new_attr11 as (--same as parsed_event_lob_parsed_attr11 but with updated value of 11th attr
        select
            event_tree_id,
            event_idx,
            /* There is only a single row for each event_tree_id and event_idx
             * that has a non-null event_attr_array. We are using the max aggregate function to extract that non-null value instead of
             * having the value in all rows and grouping by not only event_tree_id and event_idx
             * but also event_attr_array, which would have been more resource-consuming.
             *
             * max is an immutable function, so PostgreSQL can (and most likely does) calculate the result only once per selected row,
             * thus we should not worry about "calling" the function multiple times per row with the same arguments.*/
            case when (max(event_attr_array)::text[])[1] = 'S'
                then (max(event_attr_array)::text[])[:10] || string_agg(new_element, '&' order by elmnt_idx) || (max(event_attr_array)::text[])[12:]
                else max(event_attr_array)
            end event_attr_array_new_attr11
        from
            (with
            parsed_event_lob_parsed_attr11 as (
                select
                    event_tree_id, event_idx,
                    /* Select the array only for the first element.
                     * This allows us to group by event_idx instead of grouping by both event_idx and event_attr_array_no_attr11, which reduces costs
                     * and also potentially reduces memory / temporary storage consumption.*/
                    case
                        when elmnt_idx = 1 then event_attr_array
                        else null
                    end as event_attr_array,
                    elmnt_idx,
                    string_to_array(elmnt, ':') as element_value_array
                from
                    (select
                        event_tree_id,
                        event_idx,
                        event_attr_array,
                        string_to_array(
                            case
                                when event_attr_array[1] = 'S' then
                                    case
                                        when event_attr_array[11] = '' then 'null_attr11'--'null_attr11' is a special dummy value for an empty 11th attr to make sure we are not losing it
                                        else event_attr_array[11]
                                    end
                                /* Have to use a string and later specify it as a null-string in string_to_array
                                 * because otherwise string_to_array produces an array with no elements, which when unnested results in a table with no rows,
                                 * and cross join (which is also used when no join type is specified) with such a table also produces no rows, which leads to us losing data.*/
                                else 'null'
                            end,
                            '&', 'null'--specify the null-string as mentioned above
                        ) as event_attr11_element_array
                    from
                        (select id as event_tree_id, event_idx, string_to_array(event, ',') as event_attr_array
                         from
                             (values (1001, 'C,111,112,0~0,0,-1,0,113,-1,0;P,111,112,0~0,0,-1,114,0,0,0,0'), (1002, 'G,211,212,0~0,0,-1;S,221,222,0~0,0,-1,0,-1,-1,0,,-;S,221,222,0~0,0,-1,0,-1,-1,0,com/File1.java:123:com.Cls.function1:223&com/File2.java:231:com.Cls.function2:224&com/File3.java:312:com.Cls.function3:225&/java/lang/Thread.java:-1:java.lang.Thread.run:-1,-'), (1003, 'R,211,212,0~0,0,-1,-')) as event_tree (id, event_lob)
                             cross join unnest(string_to_array(event_lob, ';')) with ordinality as event (event, event_idx)--a table containing each event from event_lob in its own row
                         where event_tree.id != 1--event tree 1 is an invalid dummy entry
                        ) as parsed_event_lob--a table containing an array of attributes representing each event from event_lob in its own row
                    ) as parsed_event_lob_preparsed_attr11--same as parsed_event_lob but with the 11th attr extracted into a separate column and parsed into an array of elements
                    cross join unnest(event_attr11_element_array) with ordinality as elmnt (elmnt, elmnt_idx)
            )
            select--a table with the updated element_value_array concatenated into a string
                event_tree_id,
                event_idx,
                event_attr_array,
                elmnt_idx,
                case
                    when element_value_array[1] = 'null_attr11' then ''
                    else array_to_string(
                        element_value_array[:3] ||
                        case
                            when element_value_array[4] != '-1'--file instance is supposed to be in DB
                                then default_if_null(--if there is no matching file instance then event_lob refers to a file instance that does not exist, which means the data may be corrupted; it turned out the data we process is sometimes corrupted
                                    file_instance.new_id, -1,
                                    array['file_instance for event_tree ', event_tree_id::text, 'event_idx', event_idx::text, 'elmnt_idx=', elmnt_idx::text] || 'element_value_array'::text || element_value_array)::text
                            else '-1'
                        end,
                        ':')
                end as new_element
            from
                parsed_event_lob_parsed_attr11
                --we use left join even though we must always find a corresponding row in the right-side table, so that default_if_null can detect a violation
                left join (values (223, -223), (224, -224), (225, -225)) as file_instance (id, new_id) on element_value_array[4]::bigint = file_instance.id
            ) parsed_event_lob_parsed_new_attr11--same as parsed_event_lob_parsed_attr11 but with updated value of 11th attr
        group by event_tree_id, event_idx
    )
    select
        event_tree_id,
        string_agg(
            array_to_string(
                event_attr_array_new_attr11[:1] ||--a slice of the array with indices <=1, we need an array type here instead of an element of the array
                check_not_null(--if there is no matching event tag then event_lob refers to an event tag that does not exist, which means the data is corrupted
                    event_tag.new_id,
                    array['event_tag for event_tree', event_tree_id::text, 'event_idx', event_idx::text] || 'event_attr_array_new_attr11'::text || event_attr_array_new_attr11)::text ||
                check_not_null(
                    event_description.new_id,
                    array['event_description for event_tree', event_tree_id::text, 'event_idx', event_idx::text] || 'event_attr_array_new_attr11'::text || event_attr_array_new_attr11)::text ||
                case
                    when event_attr_array_new_attr11[1] = 'C' or event_attr_array_new_attr11[1] = 'S' then
                        event_attr_array_new_attr11[4:7] ||
                        case
                            when event_attr_array_new_attr11[8] != '-1'--file is supposed to be in DB
                                then check_not_null(
                                    files.new_id,
                                    array['files for event_tree', event_tree_id::text, 'event_idx', event_idx::text] || 'event_attr_array_new_attr11'::text || event_attr_array_new_attr11)::text
                            else '-1'
                        end ||
                        event_attr_array_new_attr11[9:12]
                    when event_attr_array_new_attr11[1] = 'P' then
                        event_attr_array_new_attr11[4:6] ||
                        case
                            when event_attr_array_new_attr11[7] != '-1'--path_condition is supposed to be in DB
                                then check_not_null(
                                    path_condition.new_id,
                                    array['path_condition for event_tree', event_tree_id::text, 'event_idx', event_idx::text] || 'event_attr_array_new_attr11'::text || event_attr_array_new_attr11)::text
                            else '-1'
                        end ||
                        event_attr_array_new_attr11[8:11]
                    else
                        event_attr_array_new_attr11[4:7]--it is [4:7] for 'R' and [4:6] for 'G', but array_to_string skips null (and therefore non-existing) array elements
                end,
                ','
            ),
            ';' order by event_idx
        ) as new_event_lob
    from
        parsed_event_lob_parsed_new_attr11
        --we use left join even though we must always find a corresponding row in the right-side table, so that check_not_null can verify this requirement
        left join (values (111, -111), (211, -211), (221, -221)) as event_tag (id, new_id) on event_attr_array_new_attr11[2]::bigint = event_tag.id
        left join (values (112, -112), (212, -212), (222, -222)) as event_description (id, new_id) on event_attr_array_new_attr11[3]::bigint = event_description.id
        left join (values (113, -113)) as files (id, new_id) on
            /* PostgreSQL does not have left-to-right short-circuiting of boolean operators (see https://www.postgresql.org/docs/10/sql-expressions.html),
             * so we have to check if the string represents an integer before casting it.*/
            (event_attr_array_new_attr11[1] = 'C' or event_attr_array_new_attr11[1] = 'S')
            and (case when event_attr_array_new_attr11[8] ~ '^-?\d+$' then event_attr_array_new_attr11[8]::bigint else null end) = files.id
        left join (values (114, -114)) as path_condition (id, new_id) on
            /* This is equivalent to
             * case when event_attr_array_new_attr11[1] = 'P'
             *     then event_attr_array_new_attr11[7]::bigint = path_condition.id
             *     else false
             * end
             * but the execution plan for the CASE variant is much worse. See the execution plans comparison below[1].*/
            event_attr_array_new_attr11[1] = 'P'
            and (case when event_attr_array_new_attr11[7] ~ '^-?\d+$' then event_attr_array_new_attr11[7]::bigint else null end) = path_condition.id
    group by event_tree_id
)
select
    event_tree_id,
    new_model_lob,
    new_event_lob,
    /* The PostgreSQL md5 function accepts a string instead of bytes, which means its behaviour depends on server_encoding
     * (use "show server_encoding;" to query it).
     * Since the application uses the UTF-8 charset for converting a string into bytes,
     * we can use the PostgreSQL md5 function only if server_encoding is UTF-8.
     * Fortunately, we require using UTF-8, so we are all good.*/
    md5(coalesce(new_model_lob, '') || '|' || coalesce(new_event_lob, '')) as events_md5
from
    new_event_tree_model_lob
    full join new_event_tree_event_lob using (event_tree_id);
-------------------------------------------------------------------------------------------------------------------------
--these functions are used by the query above
/* Returns v unchanged, or raises an exception if v is null.
 * Used to detect corrupted data: a null here means a "foreign key" embedded in a lob column
 * referenced a row that does not exist. debug_info is included in the error message to aid diagnosis.*/
create or replace function /*pg_temp.*/check_not_null(
    v bigint,
    debug_info text[])
returns bigint
language plpgsql immutable parallel safe
as $$
begin
    if v is null then
        raise exception 'The argument must not be null. Debug info: %', debug_info;
    end if;
    return v;
end;
$$;
/* Returns v, or dflt (with a warning) if v is null.
 * A softer variant of check_not_null for data that is known to sometimes be corrupted:
 * instead of failing the whole statement it substitutes a default and logs debug_info.
 * Note: plpgsql RAISE uses bare % as the placeholder; the original message used '%s',
 * which printed a stray 's' right after the substituted value (e.g. "-1s was used instead").*/
create or replace function /*pg_temp.*/default_if_null(
    v bigint,
    dflt bigint,
    debug_info text[])
returns bigint
language plpgsql immutable parallel safe
as $$
begin
    if v is null then
        raise warning 'The argument was null and % was used instead. Debug info: %', dflt, debug_info;
        return dflt;
    end if;
    return v;
end;
$$;
/*
 * event_tree.model_lob and event_tree.event_lob contain textual data which refers to rows in other tables.
 * It is similar to having FK constraints, but there is no DBMS support. The task is to update parts
 * that refer to rows in other tables with the new identifiers for those rows (the IDs are changed when data is imported).
 * Given that the structure of event_tree.event_lob is far from trivial, doing this in SQL is ugly and not easy,
 * but the alternative (download all the data piece by piece and process it in Java by making more requests to DBMS)
 * is likely not a viable solution performance-wise.
 *
 * The strings are parsed by
 * - splitting them into arrays;
 * - unnesting arrays into tables;
 * - updating elements of arrays (which are arrays on their own) that require updates;
 * - aggregating unnested array elements back into arrays;
 * - joining arrays back into strings.
 * Note that event_tree.model_lob effectively contains an array of arrays,
 * while event_tree.event_lob may contain an array of arrays of arrays (i.e. double nesting)
 * which makes the job of updating event_tree.event_lob much more difficult.
 *
 * Note that indices of PostgreSQL arrays are 1-based
 * and the DBMS allows them to go out of bounds of an array, in which case the value is considered null.
 *
 * Originally this was an update command, but it was changed to be a select and
 * real tables were replaced with fake ones produced by using values lists, so the query could be run immediately anywhere.
 * Despite being bulky, the original update command was able to process 400_000 rows under 30 seconds.
 *
 *
 * The format of event_tree.model_lob and event_tree.event_lob.
 * Legend:
 * [attrIdx] - the index of an attribute.
 * FK -> ck - the attribute is conceptually a foreign key that references the candidate key ck (which in our case always happens to be a primary key).
 *
 * event_tree.model_lob format:
 * models are separated by ';'
 * pieces of model data (attributes) are separated by ','
 * attributes:
 * [1] irrelevant
 * [2] FK -> files.id
 * [3] irrelevant
 * [4] FK -> stream_function.id
 * [5] FK -> function_metrics.id
 *
 * event_tree.event_lob format:
 * Attributes of events depend on the event type. Event types are hierarchical, e.g., event type "S"
 * is a subtype of "C". This means that "S" events have all the attributes of "C" events,
 * namely 0 - 9 (of which attributes 0-5 are present on events of any type), but additionally have attributes 10 and 11.
 * The attribute 10 is a set of stack trace elements, each of which itself has 4 attributes.
 * events are separated by ';'
 * pieces of event data (attributes) are separated by ','
 * attributes common for all event types:
 * [0] event type (one of C, S, P, R, G),
 * [1] FK -> event_tag.id,
 * [2] FK -> event_description.id,
 * [3] irrelevant
 * [4] irrelevant
 * [5] irrelevant
 * "C"-specific attributes:
 * [6] irrelevant
 * [7] FK -> files.id//can be -1 which means "no value"
 * [8] irrelevant
 * [9] irrelevant
 * "S"-specific attributes:
 * [10] elements
 * format:
 * elements are separated by '&'
 * pieces of element data (values) are separated by ':'
 * values:
 * [0] irrelevant
 * [1] irrelevant
 * [2] irrelevant
 * [3] FK -> file_instance.id//can be -1 which means "no value"
 * [11] irrelevant
 * "P"-specific attributes:
 * [6] FK -> path_condition.id//can be -1 which means "no value"
 * [7] irrelevant
 * [8] irrelevant
 * [9] irrelevant
 * [10] irrelevant
 * "R"-specific attributes:
 * [6] irrelevant
 * "G"-specific attributes:
 * none
 */
/*
 * [1] The execution plans comparison.
 *
 * -> Nested Loop Left Join (cost=4051.85..1183168.20 rows=21400000 width=80)
 *    Join Filter: CASE WHEN (parsed_event_lob_parsed_new_attr11.event_attr_array_new_attr11[1] = 'P'::text)
 *                      THEN ((parsed_event_lob_parsed_new_attr11.event_attr_array_new_attr11[7])::bigint = path_condition.id)
 *                      ELSE false
 *                 END
 *    -> Nested Loop Left Join (cost=4051.85..220144.82 rows=40000 width=72)
 *       ...<a plan for obtaining result_of_previous_operations is omitted>
 *    -> Materialize (cost=0.00..26.05 rows=1070 width=16)
 *       -> Seq Scan on path_condition (cost=0.00..20.70 rows=1070 width=16)
 * We can see that result_of_previous_operations is estimated to have 40000 rows and PG uses a nested loop to join it with 1070 rows from path_condition.
 * This results in about 40000 * 1070 / 2 = 21400000 comparisons (on average the inner loop will have 1070 / 2 iterations for each iteration of the outer loop).
 * So the CASE plan has time complexity O(rows(result_of_previous_operations) * rows(path_condition)), which is quadratic.
 *
 * -> Hash Left Join (cost=4085.92..220298.90 rows=40000 width=80)
 *    Hash Cond: (CASE WHEN (parsed_event_lob_parsed_new_attr11.event_attr_array_new_attr11[7] ~ '^-?\d+$'::text)
 *                     THEN (parsed_event_lob_parsed_new_attr11.event_attr_array_new_attr11[7])::bigint
 *                     ELSE NULL::bigint
 *                END = path_condition.id)
 *    Join Filter: (parsed_event_lob_parsed_new_attr11.event_attr_array_new_attr11[1] = 'P'::text)
 *    -> Nested Loop Left Join (cost=4051.85..220144.82 rows=40000 width=72)
 *       ...<a plan for obtaining result_of_previous_operations is omitted>
 *    -> Hash (cost=20.70..20.70 rows=1070 width=16)
 *       -> Seq Scan on path_condition (cost=0.00..20.70 rows=1070 width=16)
 * Now the plan is to build a hash table for 1070 rows in path_condition (which is of linear time complexity) and use a hash join to join this table with 40000 rows estimated for result_of_previous_operations.
 * This approach results in 40000 hash table lookups, compared to 21400000 comparisons in the original plan.
 * The time complexity for the new join plan is O(rows(result_of_previous_operations) + rows(path_condition)), i.e. linear instead of being quadratic.
 */
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment