HelenCEBM/PCA_data_processing.md Secret

## PCA_data_processing.md

      
    Raw
  

              PCA_data_processing.md
            
          
    A - Data Gathering and Compilation

A1. ONS mid-year England population and ONS annual CPI files are compiled


Annual CPI (All Items) are obtained from ONS using a custom query:


England's mid-year populations are obtained from ONS


Figures are compiled and stored in the cloud


A2. The above files are loaded into BigQuery
As the following tables:

A3. Annual PCA data downloaded and compiled

Data downloaded from here (and links therein).
BNF codes are compiled into a string.
Year is added as first column and columns rearranged as necessary (as in Excel file here).
Formulae are used to multiply up figures given as thousands.
To save as _to_load.csv file, excess blank columns beyond the data columns should be removed, and
BNF codes need preceding zeros. NOTE if csv file is simply reopened in Excel the leading zeros will appear to be removed.
File imported into BigQuery (uploaded via the Cloud) and saved as ebmdatalab.hscic.prescribing_pca_1998_2016_full


B - Construction of Lookup Tables

B1. The special_cases lookup table

This is a workaround to assign a 'most likely' classification to the few problematic drug names which exist multiple times in BNF.
Lookup table is created by running the following script (from Issue #2)

WITH temp as (
SELECT SUBSTR(SECTION_CODE,1,2) as chapter, section_code, presentation, COUNT(DISTINCT product_code) as num 
FROM ( SELECT DISTINCT section_code, section, para, subpara, chemical, product, product_code, presentation FROM ebmdatalab.hscic.bnf )
GROUP BY chapter, section_code, presentation
HAVING num >1   --where name maps to more than one bnf code
ORDER BY chapter, num DESC)

SELECT 
section_code, section, para, subpara, chemical, product, product_code, 
presentation  -- this level is to filter to the top-prescribed code for each drug name (according to latest detailed monthly data) (or, if none were prescribed, then the first product name alphabetically
FROM (
  SELECT  -- this level joins all possible product codes to aggregated prescribing data (2011-16) and ranks by items prescribed. 
    a.*, b.items AS items_2011_2016,
    row_number() OVER (PARTITION BY a.presentation ORDER BY b.items DESC) AS ranking -- We can use this to select the top/most likely drug code
  FROM (  -- this level is to look up all possible product codes for each drug name in current BNF
      SELECT 
      DISTINCT -- here we just want to go to product level rather than individual presentations
      presentation, 
      chapter, chapter_code,
      section, section_code,
      para, para_code,
      subpara, subpara_code,
      chemical, chemical_code,
      product, product_code 
      from ebmdatalab.hscic.bnf where presentation in (select presentation from temp where chapter < '18')

      ) a  -- now join to aggregated dataset grouped up to product level:
      LEFT JOIN ( SELECT substr(bnf_code,1,11) AS product_code, sum(items) as items from ebmdatalab.aggregated_data.all_prescribed_BNFs_UpToSept2016 GROUP BY product_code ) b 
            ON a.product_code = substr(b.product_code,1,11) 
      )
WHERE ranking = 1
ORDER BY chapter_code, presentation, product_code

B2. The lookup table of alternative drug spellings found within the data is created using the script below and saving as ebmdatalab.hscic.drug_name_alt_spellings_in_PCA_data_HC
-- find drug name changes in PCA data to 2016
-- save results as ebmdatalab.hscic.drug_name_alt_spellings_in_PCA_data_HC

WITH 
a AS  (
  SELECT 
      IF(LENGTH(bnf_7_char)=9,SUBSTR(bnf_7_char,2,4),SUBSTR(bnf_7_char,1,4)) AS section_code, -- extra clause added to deal with those with extra spaces 2017-08-01
      SUBSTR(drug_name,1,IF(STRPOS(drug_name,'_')>0,STRPOS(drug_name,'_')-1,length(drug_name))) AS drug_name_part,  --take first part of drug name, up to underscore (if there is one)
      MIN(year) AS min_year,  --this will help us to see which are the older vs newer spellings used
      MAX(Year) AS max_year,
      SUM(items) AS Items
  FROM ebmdatalab.hscic.prescribing_pca_1998_2016_full
  WHERE  IF(LENGTH(bnf_7_char)=9,SUBSTR(bnf_7_char,2,2),SUBSTR(bnf_7_char,1,2)) < '18'
  GROUP BY 
  section_code,
  drug_name_part
  ),
  
 b AS  
(SELECT DISTINCT
section_code,
drug_name_part,
REPLACE(REPLACE(drug_name_part, 'i', '__'),'y','__') AS IY, 
REPLACE(REPLACE(drug_name_part, 's', '__'),'z','__') AS SZ,
REPLACE(REPLACE(drug_name_part, 'ph', '__'),'f','__') AS PHF,
CONCAT(drug_name_part,'e') AS E,   -- add and E on to the end (note this only works for the LAST word)
REPLACE(drug_name_part, ' ', ' e') AS E_mid -- add an E on to the end of all words occurring before a space
FROM a)


SELECT 
a.section_code,
a.drug_name_part,
CAST(a.min_year AS STRING) AS start_date,
CAST(a.max_year AS STRING) AS end_date,
a.items,
b.drug_name_part AS alternative,
CASE WHEN REPLACE(REPLACE(a.drug_name_part, 'i', '__'),'y','__') = b.IY THEN 'i_y'
      WHEN  REPLACE(REPLACE(a.drug_name_part, 's', '__'),'z','__') = b.SZ THEN 's_z'
      WHEN  REPLACE(REPLACE(a.drug_name_part, 'ph', '__'),'f','__') = b.PHF THEN 'ph_f'
      WHEN  a.drug_name_part = b.E OR CONCAT(a.drug_name_part,'e') = b.drug_name_part THEN 'e_end'
      WHEN  a.drug_name_part = b.E_mid OR REPLACE(a.drug_name_part, ' ', 'e ') =b.drug_name_part THEN 'e_end'
      END AS type
FROM a
INNER JOIN b 
  ON (REPLACE(REPLACE(a.drug_name_part, 'i', '__'),'y','__') = b.IY 
      OR REPLACE(REPLACE(a.drug_name_part, 's', '__'),'z','__') = b.SZ 
      OR REPLACE(REPLACE(a.drug_name_part, 'ph', '__'),'f','__') = b.PHF
      OR a.drug_name_part = b.E   --note this will only show this match once, so we put in the other way around also
      OR a.drug_name_part = b.E_mid
      OR CONCAT(a.drug_name_part,'e') = b.drug_name_part 
      OR REPLACE(a.drug_name_part, ' ', 'e ') =b.drug_name_part)   
  AND a.drug_name_part != b.drug_name_part
  AND a.section_code = b.section_code
ORDER BY items desc

B3. The lookup table of Chemical name changes is created using the script below and saving as pca_chemical_old_to_new_lookup
-- PCA data - finding up to date chemical to combine with dataset
--save results as ebmdatalab.hscic.pca_chemical_old_to_new_lookup 
WITH A as (
  SELECT 
    IF(LENGTH(bnf_7_char)=9,SUBSTR(bnf_7_char,2,4),SUBSTR(bnf_7_char,1,4)) AS Section, 
    drug_name, 
    count(distinct chemical) AS chems,
    max(year) AS Max_year_overall
    FROM
    ebmdatalab.hscic.prescribing_pca_1998_2016_full 
    where IF(LENGTH(bnf_7_char)=9,SUBSTR(bnf_7_char,2,2),SUBSTR(bnf_7_char,1,2)) <'18'
    GROUP BY 
    Section, 
    drug_name
    HAVING chems >1
    ),
    
B AS (
SELECT IF(LENGTH(bnf_7_char)=9,SUBSTR(bnf_7_char,2,4),SUBSTR(bnf_7_char,1,4)) AS section, drug_name, chemical,
    min(year) AS Min_year, 
    max(year) AS Max_year
    FROM ebmdatalab.hscic.prescribing_pca_1998_2016_full
GROUP BY Section, drug_name, chemical
    ),
    
C AS (
SELECT DISTINCT 
    A.Section, 
    A.drug_name, 
    B.chemical,
    B.min_year,
    B.max_year,
    IF(max_year = Max_year_overall,1,0) AS latest
    FROM A LEFT JOIN B ON A.drug_name = B.drug_name AND A.Section = B.Section
ORDER BY drug_name, chemical
)

SELECT old.section, old.drug_name, old.chemical AS old_chemical_name, 
    nw.chemical AS new_chemical_name, nw.min_year AS Since
    FROM c old
     LEFT JOIN c nw ON old.drug_name = nw.drug_name AND old.chemical != nw.chemical and nw.latest = 1
    WHERE old.latest = 0
ORDER BY old.section, old.drug_name 

B4. Known drug name changes

As reported online by patient.info
ebmdatalab.hscic.drug_name_changes_2013

B5. Fuzzy lookup for drugs not matching to BNF

List of drugs not matching identified through earlier iterations of the code.
These drugs were matched to BNF via fuzzy lookup in Excel and manually checked by a pharmacist.
List available here and stored as ebmdatalab.hscic.pca_bnf_name_to_code_fuzzy_lookup

C - Data Extraction and Normalisation

C1. The latest chemical name for each drug is appended into the full dataset, to create prescribing_pca_1998_2016_full_v2

This does not take into account spelling changes but those will be handled later

-- save results as ebmdatalab.hscic.prescribing_pca_1998_2016_full_v2

SELECT a.*, COALESCE(c2.new_chemical_name, a.chemical) AS new_chemical_name FROM 
ebmdatalab.hscic.prescribing_pca_1998_2016_full a

LEFT JOIN ebmdatalab.hscic.pca_chemical_old_to_new_lookup c2 
   ON a.drug_name = c2.drug_name 
   AND a.chemical = c2.old_chemical_name 
   AND IF(LENGTH(bnf_7_char)=9,SUBSTR(bnf_7_char,2,4),SUBSTR(bnf_7_char,1,4)) = c2.section

C2. Run final data extraction parts 1 and 2 (scripts copied and updated from Issues #6 and #7)
C2a. Part 1
-- Final PCA data extraction part 1 (2016)
-- save results as ebmdatalab.tmp_eu.trends_from_pca

WITH 
temp AS 
(SELECT DISTINCT X.section_code, X.drug_name_part AS old_name, X.alternative FROM 
ebmdatalab.hscic.drug_name_alt_spellings_in_PCA_data_HC X
INNER JOIN ebmdatalab.hscic.drug_name_alt_spellings_in_PCA_data_2016_HC Y ON X.alternative = Y.drug_name_part AND Y.end_date = '2016'
),

b AS (
  SELECT DISTINCT 
   chapter_code, chapter, section_code, section, para, subpara, chemical, product, product_code, 
   REPLACE(presentation,'GlucOsamine','prop-GlucOsamine') AS presentation,REPLACE(presentation,' ','') 
     AS presentation_no_spaces 
   FROM ebmdatalab.hscic.bnf
   WHERE presentation NOT IN (SELECT presentation from ebmdatalab.hscic.bnf_name_to_product_special_cases_helen)
   AND chapter_code <'18'),
   
a0 AS (
SELECT *,
      TRIM(bnf_7_char) AS bnf_7_char_trim,
      SUBSTR(drug_name,1,IF(STRPOS(drug_name,'_')>0,STRPOS(drug_name,'_')-1,length(drug_name))) 
        AS drug_name_part,
      SUBSTR(drug_name,1,IF(STRPOS(drug_name,' ')>0,STRPOS(drug_name,' ')-1,length(drug_name))) 
        AS drug_name_part_short,
      SUBSTR(chemical,1,IF(STRPOS(chemical,' ')>0,STRPOS(chemical,' ')-1,length(chemical))) 
        AS chemical_short,
      REPLACE(drug_name,'GlucOsamine','prop-GlucOsamine') AS drug_name_a,
      REPLACE(REPLACE(drug_name,'GlucOsamine','prop-GlucOsamine'),'Sulph','Sulf') AS drug_name_b,
      REPLACE(new_chemical_name,'Sulph','Sulf') AS new_chemical_name_b
      FROM ebmdatalab.hscic.prescribing_pca_1998_2016_full_v2 a
      WHERE IF(LENGTH(bnf_7_char)=9,SUBSTR(bnf_7_char,2,2),SUBSTR(bnf_7_char,1,2)) < '18'), 
      
a1 AS (SELECT a0.*, 
        z.new_bnf_code AS code_fuzzy,
        z.new_name AS drug_name_fuzzy,
        CONCAT( UPPER(substr(d.new_name,1,1)), substr(D.new_name,2,LENGTH(D.new_name)-1) ) AS product_2013, 
        E.alternative AS product_new_spelling,
        CONCAT( UPPER(substr(d1.new_name,1,1)), substr(D1.new_name,2,LENGTH(D1.new_name)-1) ) AS chemical_2013,  -- note, this capitalises the first letter only
        replace(a0.new_chemical_name_b,a0.chemical_short,D3.new_name) AS chemical_2013b,
        replace(a0.drug_name_b,a0.drug_name_part,e.alternative) AS converted_drug_name, -- incorporate new spellings into drug name
        replace(a0.drug_name_b,a0.drug_name_part,D.new_name) AS converted_drug_name2,
        replace(a0.drug_name_b,a0.drug_name_part_short,D2.new_name) AS converted_drug_name3,
        SUBSTR(drug_name_b,1,IF(STRPOS(drug_name_b,'_')>0,STRPOS(drug_name_b,'_')-1,length(drug_name_b))) AS drug_name_part_b

        FROM a0 
      LEFT JOIN ebmdatalab.hscic.drug_name_changes_2013 D ON LOWER(drug_name_part) = D.old_name
      LEFT JOIN ebmdatalab.hscic.drug_name_changes_2013 D1 ON LOWER(a0.chemical) = D1.old_name
      LEFT JOIN ebmdatalab.hscic.drug_name_changes_2013 D2 ON LOWER(drug_name_part_short) = D2.old_name
      LEFT JOIN ebmdatalab.hscic.drug_name_changes_2013 D3 ON LOWER(a0.chemical_short) = D3.old_name
      LEFT JOIN temp E ON a0.drug_name_part = E.old_name AND SUBSTR(a0.bnf_7_char_trim,1,4) = e.section_code 
      LEFT JOIN ebmdatalab.hscic.pca_bnf_name_to_code_fuzzy_lookup z ON A0.drug_name = z.old_name
      ),
      
      
--CAPITALISE WHERE NEEDED:
A2 AS (
SELECT *,
      CONCAT( UPPER(substr(converted_drug_name2,1,1)), substr(converted_drug_name2,2,LENGTH(converted_drug_name2)-1) ) AS converted_drug_name4,
      CONCAT( UPPER(substr(converted_drug_name3,1,1)), substr(converted_drug_name3,2,LENGTH(converted_drug_name3)-1) ) AS converted_drug_name5,
      CONCAT( UPPER(substr(chemical_2013b,1,1)), substr(chemical_2013b,2,LENGTH(chemical_2013b)-1) ) AS chemical_2013_c      --capitalise chemical as well. 
      FROM A1 
      ),

--COALESCE TO FORM "FINAL" NAMES
a3 AS (
 SELECT *,
COALESCE(converted_drug_name,converted_drug_name4,converted_drug_name5,drug_name_fuzzy,drug_name_b) 
   AS drug_name_F, 
 COALESCE(product_new_spelling,product_2013,drug_name_part_b) AS drug_name_part_F,
 COALESCE(chemical_2013, chemical_2013_c, new_chemical_name_b) AS chemical_F
 from A2 
 ),

 --add a drug name field without spaces:
 a4 AS (
 select *, REPLACE(drug_name_F,' ','')  as drug_name_F_no_spaces
 from A3
 ),
 
 
a AS  (
  SELECT 
      x.bnf_7_char_trim AS bnf_code,
      x.drug_name,
      drug_name_F,
      COALESCE(spc.presentation,b.presentation,ba.presentation, bb.presentation,bc.presentation, bd.presentation) 
        AS current_bnf_name,
      COALESCE(spc.product_code,b.product_code,ba.product_code, bb.product_code,bc.product_code, bd.product_code) 
        AS current_bnf_code,
      drug_name_part,
      drug_name_part_F, -- use as product name if no other
      x.section,
      x.subpara,
      x.chemical AS Chemical_original,
      x.chemical_F AS Chemical,
      x.Year,
      SUM(x.owc2) AS OWC2,   -- prescribed generically but no generic available
      SUM(x.NIC) AS Cost,
      SUM(x.items) AS Items,
      SUM(x.quantity) AS Quantity
  FROM a4 x
    --AND A.Currently_in_BNF = 'N' 
    LEFT JOIN ebmdatalab.hscic.bnf_name_to_product_special_cases_helen spc ON upper(x.drug_name_F) = upper(spc.presentation)  -- look up original drug details in current bnf (drugs matching more than one drug in bnf)
    LEFT JOIN b ON upper(x.drug_name_F) = upper(b.presentation)   -- use upper to match up examples like this: "Pentasa Sr_Tab 250mg" and "Pentasa SR_Tab 250mg"
            AND SUBSTR(x.bnf_7_char_trim,1,4) = b.section_code   -- look up original drug details in current bnf.
            
    LEFT JOIN b ba ON upper(x.drug_name_F) = upper(ba.presentation) 
            AND SUBSTR(x.bnf_7_char_trim,1,4) != ba.section_code  -- check if drug now only belongs in a different section but same chapter
            AND SUBSTR(x.bnf_7_char_trim,1,2) = ba.chapter_code 
    LEFT JOIN b bb ON upper(x.drug_name_F) = upper(bb.presentation) 
            AND SUBSTR(x.bnf_7_char_trim,1,2) != bb.chapter_code  -- check if drug now only belongs in a different chapter
    LEFT JOIN b bc ON upper(x.drug_name) = upper(bc.presentation) AND b.presentation IS NULL AND ba.presentation IS NULL AND bb.presentation IS NULL  --also check original in case new drug name didn't work e.g. nifedipin(e)
            AND SUBSTR(x.bnf_7_char_trim,1,4) = bc.section_code  
    LEFT JOIN b bd ON x.drug_name_F_no_spaces = bd.presentation_no_spaces   -- match without spaces e.g. Terbut Sulf_Inha 250mcg (400 D) vs "(400D)"
            AND SUBSTR(x.bnf_7_char_trim,1,4) = bd.section_code  AND b.presentation IS NULL AND ba.presentation IS NULL AND bb.presentation IS NULL   -- look up original drug details in current bnf.
 
  GROUP BY 
      bnf_code, drug_name, drug_name_F, current_bnf_name, current_bnf_code,
      drug_name_part, drug_name_part_F, -- use as product name if no other
      section, subpara, Chemical_original, Chemical, Year
      )
      
SELECT 
    a.bnf_code,
    a.current_bnf_code AS Product_code_updated,
    SUBSTR(COALESCE(a.current_bnf_code,b.product_code,a.bnf_code),1,2) AS Chapter_code_current,
    SUBSTR(a.bnf_code,1,2) AS BNF_Chap_Code,
    COALESCE(b.chapter, ch.description) AS Chapter_Current,
    ch.description AS Chapter_original,
    SUBSTR(COALESCE(a.current_bnf_code,a.bnf_code),3,2) AS Section_code_current,
    SUBSTR(bnf_code,3,2) AS BNF_Section_Code,
    COALESCE(b.section,se.description,a.section) AS Section_Current,
    a.section AS Section_Original,
    SUBSTR(COALESCE(a.current_bnf_code,b.product_code,a.bnf_code),5,2) AS Para_code_current,
    COALESCE(b.para,pa.description) As Para_current,
    COALESCE(b.subpara,a.subpara) As Subpara_current,
    a.subpara AS Subpara_original,
    COALESCE(b.chemical,a.chemical) As Chemical_current,
    a.Chemical_original,
    COALESCE(b.product, a.drug_name_part_F) AS Product_current,
    current_bnf_name,
    a.drug_name,
    IF(b.product_code IS NULL,'N','Y') AS Currently_in_BNF,
    a.year,
    a.Items,
    a.owc2,
    a.Quantity,
    a.Cost
    
  FROM a
  LEFT JOIN ebmdatalab.hscic.bnf_vertical ch ON SUBSTR(a.bnf_code,1,2) = ch.code
  LEFT JOIN ebmdatalab.hscic.bnf_vertical se ON SUBSTR(a.bnf_code,1,4) = se.code
  LEFT JOIN ebmdatalab.hscic.bnf_vertical pa ON SUBSTR(a.bnf_code,1,6) = pa.code
  LEFT JOIN b ON a.current_bnf_name = b.presentation   
            AND a.current_bnf_code = b.product_code

Save results as ebmdatalab.tmp_eu.trends_from_pca_2016
C2b. Part 2
-- final pca data extraction (2016) part 2
-- distinct product-chemical combinations in current BNF:
WITH
 chem_p AS (
   SELECT DISTINCT product, product_code, chemical_code, chemical,
   count (distinct product_code) Over (partition by chemical_code, product) 
      AS Dist_prods_with_same_name
   FROM ebmdatalab.hscic.bnf 
   WHERE chapter_code <'18' 
   ORDER BY Dist_prods_with_same_name, product),

-- find all drug_name_parts in PCA which have been mapped to a new chemical:
chem_0 AS (
  SELECT
  SUBSTR(drug_name,1,IF(STRPOS(drug_name,'_')>0,STRPOS(drug_name,'_')-1,length(drug_name))) 
      AS drug_name_part, 
  drug_name, section, old_chemical_name, new_chemical_name
  FROM ebmdatalab.hscic.pca_chemical_old_to_new_lookup_2016),

-- distinct *chemicals* in current BNF: 
chem_a AS (
SELECT chemical,  
    count(distinct chapter) AS Chapters,
    count(distinct section) AS Sections, 
    count(distinct para) AS Paras, 
    count(distinct chemical_code) AS Codes, 
    min(chemical_code) AS min_code
  FROM ebmdatalab.hscic.bnf
  WHERE chapter_code < '18'
  GROUP BY chemical 
  ORDER BY codes DESC, paras DESC, chemical),
  
-- for chemicals with multiple codes:
-- check whether each chemical code is the only one in its paragraph / section / chapter
chem_a1 AS 
(SELECT DISTINCT 
  a.chemical, b.chemical_code, a.paras, b.para_code, a.sections,b.section_code, a.chapters, b.chapter_code, 
  count(distinct b.chemical_code) over (partition by b.chemical,chapter_code) 
    AS appearances_by_chapter,
  count(distinct b.chemical_code) over (partition by b.chemical,section_code)  
    AS appearances_by_section,
  count(distinct b.chemical_code) over (partition by b.chemical,para_code) 
    AS appearances_by_para
  FROM ebmdatalab.hscic.bnf b
  INNER JOIN chem_a a ON a.chemical = b.chemical and a.codes > 1
  WHERE b.chapter_code < '18'
  ORDER BY chemical ),


-- SELECT ALL CHEMICALS FROM BNF WHICH MAP TO A SINGLE PRODUCT
-- used in final step only
b AS (
  SELECT DISTINCT 
   chapter_code, chapter, section_code, section, para_code, para, subpara_code, subpara, chemical_code
   FROM ebmdatalab.hscic.bnf
   WHERE chapter_code <'18'),
   
   
t as (
 SELECT t.*, 
 REPLACE(c2.new_chemical_name,'Streptokinase-Streptodornase','Streptokinase & Streptodornase') 
      AS new_chemical_name,
 CASE WHEN Product_current LIKE 'Levonelle%' THEN '0703050A0BC'   -- 'Levonelle'
      WHEN Product_current LIKE 'Postinor%' THEN '0703050A0BB'    -- 'Postinor'
      WHEN t.drug_name LIKE 'Terbut%Sulph_Syr%' THEN '0301011V0AA'  -- 'Terbut Sulf'
      WHEN t.drug_name LIKE 'Thalidomide%' AND Chapter_code_current = '05' THEN '0501100J0AA' -- 'Thalidomide (Antileprotic)'
      WHEN Product_current LIKE 'Menoring 50' THEN '0702010G0BE'  -- 'Menoring 50'
      WHEN t.drug_name = 'Acetylcy_Eye Dps 10% (Old)' THEN '1108010C0AA' -- 'Acetylcy (Eye)'
      WHEN t.drug_name = 'Abilify Maintena_Inj 400mg Vl + Dil' THEN '0402020ADBB' -- 'Abilify Maintena'
      WHEN Product_current LIKE 'Melatonin%' THEN '0401010ADAA' -- 'Melatonin'
      WHEN Product_current LIKE 'Varidase%' THEN '1311070R0BB' -- 'Varidase' 
      WHEN t.drug_name = 'Cocois_Scalp Oint' THEN '1305020V0BB' -- 'Cocois'
      WHEN t.drug_name = 'Levocarnitine_Oral Soln Paed 1.5g/5ml30%' THEN '0908010C0AA' -- 'Levocarnitine'
      ELSE product_code_updated
      END AS product_code_updated_manual,
 CASE WHEN Product_current LIKE 'Levonelle%' THEN 'Levonelle'
      WHEN Product_current LIKE 'Postinor%' THEN 'Postinor'
      WHEN t.drug_name LIKE 'Terbut%Sulph_Syr%' THEN 'Terbut Sulf'
      WHEN t.drug_name LIKE 'Thalidomide%' AND Chapter_code_current = '05' THEN 'Thalidomide (Antileprotic)'
      WHEN Product_current LIKE 'Menoring 50' THEN 'Menoring 50'
      WHEN t.drug_name = 'Acetylcy_Eye Dps 10% (Old)' THEN 'Acetylcy (Eye)'
      WHEN t.drug_name = 'Abilify Maintena_Inj 400mg Vl + Dil' THEN 'Abilify Maintena'
      WHEN Product_current LIKE 'Melatonin%' THEN 'Melatonin'
      WHEN Product_current LIKE 'Varidase%' THEN 'Varidase' 
      WHEN t.drug_name = 'Cocois_Scalp Oint' THEN 'Cocois'
      WHEN t.drug_name = 'Levocarnitine_Oral Soln Paed 1.5g/5ml30%' THEN 'Levocarnitine'
      ELSE product_current
      END AS product_current_manual
 FROM ebmdatalab.helen.trends_from_pca_2016 t
 LEFT JOIN chem_0 c2 ON t.drug_name = c2.drug_name AND t.chemical_current = c2.old_chemical_name AND SUBSTR(t.bnf_code,1,4) = c2.section
 ),
 
 
  A AS (
 SELECT T.*, 
 COALESCE(chem_p.product,c2.product,product_current_manual) AS current_product, 
    -- use this order in coalesce because we want to update/replace any existing product names for which we now have a better one. 
 COALESCE(product_code_updated_manual,chem_p.product_code,c2.product_code) 
    AS current_product_code,
 COALESCE(chem_p.chemical,c2.chemical,chem_a.chemical,chem_a1.chemical,c3.chemical) 
    AS unique_chem,  -- chemicals currently in BNF (uniquely)
 COALESCE(chem_p.chemical_code,c2.chemical_code,chem_a.min_code,chem_a1.chemical_code,c3.min_code) 
    AS unique_chem_code

 FROM  t
  -- link to BNF using whole Product name (note this will be drug_name_part) --------------------
  -- chemical must match as well because product names are not always unique.
  LEFT JOIN chem_p ON t.product_current = chem_p.product 
            AND t.Product_code_updated_manual IS NULL
            AND (UPPER(chem_p.chemical) = UPPER(Chemical_current) 
                      OR  UPPER(chem_p.chemical) = UPPER(new_chemical_name)) 
            AND SUBSTR(chem_p.chemical_code,1,6) = SUBSTR(bnf_code,1,6)  
                      -- some chemicals sit in multiple paras.
            AND chem_p.Dist_prods_with_same_name = 1 
    
  -- try shortening Product names in BNF to match products in data (only if whole name is not found) --
    LEFT JOIN chem_p c2 ON t.product_current = SUBSTR(c2.product,1,length(t.product_current)) 
            AND t.Product_code_updated_manual IS NULL 
            AND chem_p.product IS NULL
            AND UPPER(c2.chemical) IN (UPPER(Chemical_current), UPPER(new_chemical_name)) 
            AND SUBSTR(c2.chemical_code,1,6) = SUBSTR(bnf_code,1,6)  --some chems sit in multiple paras.
            AND chem_p.Dist_prods_with_same_name = 1

  -- link to BNF using "original" chemical name for chemicals which are unique in BNF  ---------------
  LEFT JOIN chem_a ON UPPER(chem_a.chemical)  = UPPER(Chemical_current)  
            AND chem_a.codes = 1 AND chem_p.chemical IS NULL 
            AND t.Product_code_updated_manual IS NULL
  -- link to BNF using NEW chemical name for chemicals which are unique in BNF  --------------------
  LEFT JOIN chem_a c3 ON c3.chemical  = new_chemical_name  
            AND c3.codes = 1 AND chem_a.chemical IS NULL 
            AND chem_p.chemical IS NULL 
            AND t.Product_code_updated_manual IS NULL
  -- link to BNF using NEW chemical name for chemicals which are NON-unique in BNF  --------------
  -- provided that no chemical has been assigned in a previous join.
  -- first check same paragraph then section then chapter.
  LEFT JOIN chem_a1 ON chem_a1.chemical  = Chemical_current  
            AND chem_p.chemical IS NULL 
            AND chem_a.chemical IS NULL 
            AND c3.chemical IS NULL 
            AND t.Product_code_updated_manual IS NULL
            AND (
              (chem_a1.para_code = SUBSTR(bnf_code,1,6) AND chem_a1.appearances_by_para = 1) 
              OR (chem_a1.section_code = SUBSTR(bnf_code,1,4) AND chem_a1.appearances_by_section = 1) 
              OR (chem_a1.chapter_code = SUBSTR(bnf_code,1,2) AND chem_a1.appearances_by_chapter = 1)
                      )
 ORDER BY drug_name,year ),
 
 u AS (
 select bnf_code,	
 Chapter_code_current,	BNF_Chap_Code,	Chapter_Current, Chapter_original,	
 Section_code_current,	BNF_Section_Code,	Section_Current,	Section_Original,	
 Para_code_current,	Para_current,	
 Subpara_current,	Subpara_original,	
 COALESCE(unique_chem_code, SUBSTR(Product_code_updated,1,9), SUBSTR(current_Product_code,1,9)) 
   AS chem_code_today, --chemical code
 Chemical_original,	
 COALESCE(unique_chem,Chemical_current) AS chem_today,
 COALESCE(Product_code_updated, current_product_code)	 AS prod_code_today,
 COALESCE(current_product,	Product_current) AS prod_today, 
   -- note this is opposite way around to code because we want to replace the previous name 
   -- but there may not be a code.
 current_bnf_name, drug_name,	
 Currently_in_BNF,  
 year,  Items,  owc2,  Quantity,  Cost
 FROM a 
 ORDER BY drug_name, year)
 
 
SELECT 
    bnf_code,	
    COALESCE(b.chapter_code,Chapter_code_current) AS Chapter_code_current,
    BNF_Chap_Code,	
    COALESCE(b.chapter,Chapter_Current) AS Chapter_Current,
    Chapter_original,	
    COALESCE(SUBSTR(b.section_code,3,2),Section_code_current) AS Section_code_current,	
    BNF_Section_Code,	
    COALESCE(b.section,Section_Current) AS Section_current,	
    Section_Original,	
    COALESCE(SUBSTR(b.para_code,5,2),Para_code_current) AS Para_code_current,	
    COALESCE(b.para,Para_current) AS Para_current,	
    COALESCE(b.subpara,Subpara_current) AS Subpara_current,	
    Subpara_original,	
    chem_code_today AS Chemical_code_current,
    Chemical_original,	
    chem_today AS Chemical_current,
    prod_code_today AS Prod_code_current,
    prod_today AS product_current,
    current_bnf_name, drug_name,	
    Currently_in_BNF, u.year,  Items,  owc2,  Quantity,  Cost,
    -- add calculated fields:   
    1000*items/pop.Population AS ItemsPer1000,
    1000*quantity/pop.Population AS QuantityPer1000,
    Inf.Multiplier_2016*cost AS Infl_corr_Cost,
    1000*Inf.Multiplier_2016*cost/pop.Population AS Infl_corr_Cost_per1000,
    IEEE_DIVIDE(Inf.Multiplier_2016*Cost, Items) AS Infl_corr_CostPerItem,
    1000*owc2/pop.Population AS Owc2Per1000
 
 FROM U 
 LEFT JOIN b ON u.chem_code_today = b.chemical_code
 LEFT JOIN ebmdatalab.ONS.england_midyear_population pop ON u.Year = pop.Year
 LEFT JOIN ebmdatalab.ONS.inflation_cpi inf ON u.Year = inf.Year
 --WHERE LENGTH(chem_code_today) =8