Skip to content

Instantly share code, notes, and snippets.

@sebnyberg
Last active August 30, 2017 20:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sebnyberg/428ee3e73dd71b45a0c1c27351f45014 to your computer and use it in GitHub Desktop.
Save sebnyberg/428ee3e73dd71b45a0c1c27351f45014 to your computer and use it in GitHub Desktop.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using DataReader.Extensions;
using iTextSharp.text.log;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
namespace DataReader.ExploredCode
{
internal class OldProgram : IApplication
{
private readonly ILogger _logger;
/// <summary>
/// Creates the application and keeps the supplied logger in a private field.
/// </summary>
/// <param name="logger">Logger instance stored for this application.</param>
public OldProgram(ILogger logger) => _logger = logger;
/// <summary>
/// Extracts the drug-name / drug-tier table from the formulary PDF and writes it as a CSV file
/// into the application's base directory.
/// </summary>
/// <remarks>
/// The page range, the repeated per-page text and the hard-coded trailing row are tuned by hand
/// for <c>2017_TX_5T_EX.pdf</c>; they have to be re-checked for any other document.
/// After writing the file the method prints a summary and waits for a key press.
/// </remarks>
public void Run()
{
    // MANUALLY ENTER VALUES FOR VARIABLES BELOW | (current values fit 2017_TX_5T_EX.pdf)
    var basePath = AppDomain.CurrentDomain.BaseDirectory;
    const string pdfFilename = @"Docs\2017_TX_5T_EX.pdf";
    const string csvFilename = @"KaiserDrugTierList.csv";
    // First and last PDF page (inclusive) that text is extracted from.
    const int firstPageTable = 15;
    const int lastPageTable = 159;
    // Check manually, looking for the last page number found in the extracted text.
    const int pageNumberOfLastPageOfTables = 145;

    // Path.Combine gives the right path whether or not the base directory ends with a separator.
    var pdfReader = new PdfReader(System.IO.Path.Combine(basePath, pdfFilename));
    string text;
    try
    {
        text = ReadPdfAsText(pdfReader, firstPageTable, lastPageTable);
    }
    finally
    {
        // Close the reader even when extraction throws.
        pdfReader.Close();
    }

    // Text repeated on every page; in the extracted text it is followed by the page number and '\n'.
    const string pageBoilerplate = "2017\n" +
                                   "Texas Residents -- Find and estimate prices for medicines on this formulary at:\n" +
                                   "https://www.myprime.com/v/BCBSTX/COMMERCIAL/TXMKTGNPLS/en/find-medicine.html\n" +
                                   "BCBSTX Health Insurance Marketplace 5 Tier Drug List July 2017 ";
    for (var pageNumber = 1; pageNumber <= pageNumberOfLastPageOfTables; pageNumber++)
    {
        text = text.Replace(pageBoilerplate + pageNumber + '\n', string.Empty);
    }

    // Column headings of the table.
    const string columnHeadings = "Drug Name\n" +
                                  "Drug Tier\n" +
                                  "Prior Authorization\n" +
                                  "Step Therapy\n" +
                                  "Dispensing Limits\n" +
                                  "ACA\n" +
                                  "Limited Distribution\n";
    text = text.Replace(columnHeadings, string.Empty);

    // Remove all dots
    text = text.Replace("•", string.Empty);

    // Remember where all new lines were: every '\n' becomes a standalone "$" token.
    text = text.Replace("\n", " $ ");
    var words = text.Split(' ');

    var drugs = ParseDrugs(words);

    var csv = new StringBuilder();
    foreach (var drug in drugs)
    {
        AppendCsvRow(csv, drug.name, drug.tier);
    }

    // Add any drugs from the 'tail' of the table that was cut off
    AppendCsvRow(csv, "ZORTRESS - everolimus tab 0.75 mg", "4");

    System.IO.File.WriteAllText(System.IO.Path.Combine(basePath, csvFilename), csv.ToString());
    Console.WriteLine($"Extracted information from {drugs.Count} drugs.");
    Console.ReadKey();
}

// Walks the token stream and pairs each collected drug description with the tier token that ends it.
private static List<(string name, string tier)> ParseDrugs(string[] words)
{
    var drugs = new List<(string name, string tier)>();
    var index = 0;
    while (index < words.Length)
    {
        // Find the first word that isn't all caps (per the original notes: a lowercase word or a
        // lone "-"), ignoring "$" newline markers. The bounds check must come before the element
        // access, otherwise reaching the end of the stream throws.
        while (index < words.Length && (words[index].Equals("$") || words[index].IsUpper()))
        {
            ++index;
        }
        if (index >= words.Length)
        {
            break;
        }

        var name = new StringBuilder();
        var retrieveWordsUpToIndex = -1;

        // A lone "-" means upper-case words before it belong to the name, so walk backwards
        // to pick them up before continuing forwards.
        if (words[index].Equals("-"))
        {
            retrieveWordsUpToIndex = index;

            // NOTE(review): the original comment says "find first word with alphabetic char", but
            // the loop actually steps back over words that DO contain one and stops at the first
            // that does not. Logic kept as-is; only the lower bound was added.
            index = Math.Max(index - 1, 0);
            while (index > 0 && words[index].ContainsAlphabeticCharacter())
            {
                index--;
            }

            // Go backwards until at the start of the current line; the start of the token
            // stream counts as a line start.
            while (index > 0 && !words[index - 1].Equals("$"))
            {
                index--;
            }
        }

        // Replay the words found by the backward walk, up to and including the "-".
        for (; index <= retrieveWordsUpToIndex; index++)
        {
            AppendWord(name, words[index]);
        }

        // Collect the rest of the description until a tier token is reached. IsTier only accepts a
        // token preceded by a "$" marker, so the very first token can never be a tier.
        for (; index < words.Length && (index == 0 || words[index].Equals("$") || !IsTier(words, index)); ++index)
        {
            AppendWord(name, words[index]);
        }
        if (index >= words.Length)
        {
            break;
        }

        drugs.Add((name.ToString(), words[index]));

        // index currently points at the tier of the drug just added; move past it.
        ++index;
    }

    return drugs;
}

// Appends a word followed by a space to the name being built; "$" newline markers are skipped.
private static void AppendWord(StringBuilder name, string word)
{
    if (word.Equals("$"))
    {
        return;
    }
    name.Append(word).Append(' ');
}

// Appends one CSV row with both fields quoted; embedded quotes are doubled so the row stays parseable.
private static void AppendCsvRow(StringBuilder csv, string name, string tier)
{
    const string quote = "\"";
    const string escapedQuote = "\"\"";
    csv.Append(quote).Append(name.Replace(quote, escapedQuote)).Append(quote)
       .Append(',')
       .Append(quote).Append(tier.Replace(quote, escapedQuote)).Append(quote)
       .AppendLine();
}
/// <summary>
/// Tells whether the token at <paramref name="index"/> is a drug-tier value standing alone on its line.
/// </summary>
/// <param name="words">Token stream in which "$" marks a line break.</param>
/// <param name="index">Position of the token to test.</param>
/// <returns>
/// <c>true</c> when the token is one of the known tier labels, is preceded by a "$" marker and is
/// either followed by a "$" marker or is the last token; otherwise <c>false</c>. Positions that
/// have no preceding token, or that lie outside the list, yield <c>false</c>.
/// </returns>
private static bool IsTier(IReadOnlyList<string> words, int index)
{
    string[] tiers = { "1", "2", "3", "4", "5", "1,2", "A" };

    // A tier needs a preceding "$" marker, so the first token (or an out-of-range index) can never
    // be one. Without this guard words[index - 1] throws for index 0.
    if (index <= 0 || index >= words.Count)
    {
        return false;
    }

    if (!words[index - 1].Equals("$"))
    {
        return false;
    }

    // Special case for the last entry in the pdf table: there is no following token to check.
    var isLastWord = index == words.Count - 1;
    if (!isLastWord && !words[index + 1].Equals("$"))
    {
        return false;
    }

    return tiers.Contains(words[index]);
}
/// <summary>
/// Extracts the text of every page in an inclusive page range and concatenates it in page order.
/// </summary>
/// <param name="pdfReader">Reader for the PDF document to extract from.</param>
/// <param name="firstPage">Number of the first page to extract.</param>
/// <param name="lastPage">Number of the last page to extract (inclusive).</param>
/// <returns>The concatenated page texts; empty when <paramref name="lastPage"/> is below <paramref name="firstPage"/>.</returns>
private static string ReadPdfAsText(PdfReader pdfReader, int firstPage, int lastPage)
{
    var systemEncoding = Encoding.Default;
    var utf8 = Encoding.UTF8;
    var extracted = new StringBuilder();

    var page = firstPage;
    while (page <= lastPage)
    {
        // A new strategy instance per page, so no state is shared between pages.
        var rawText = PdfTextExtractor.GetTextFromPage(pdfReader, page, new SimpleTextExtractionStrategy());

        // NOTE(review): the text is round-tripped through Encoding.Default and back to a string.
        // That looks lossy for characters the default code page cannot represent - confirm
        // whether the downstream parsing relies on it before removing.
        var systemBytes = systemEncoding.GetBytes(rawText);
        var utf8Bytes = Encoding.Convert(systemEncoding, utf8, systemBytes);
        extracted.Append(utf8.GetString(utf8Bytes));

        page++;
    }

    return extracted.ToString();
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment