sebnyberg/bruteforce

## bruteforce
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using DataReader.Extensions;
using iTextSharp.text.log;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;

namespace DataReader.ExploredCode
{
    internal class OldProgram : IApplication
    {
        private readonly ILogger _logger;

        /// <summary>
        /// Creates the application and keeps the supplied logger in <c>_logger</c>.
        /// </summary>
        /// <param name="logger">Logger instance stored for later use.</param>
        public OldProgram(ILogger logger) => _logger = logger;
        /// <summary>
        /// Reads the drug-table pages of the formulary PDF, strips the boilerplate that
        /// is repeated on every page, pairs each drug name with its tier and writes the
        /// result as a quoted two-column CSV file in the application base directory.
        /// </summary>
        /// <remarks>
        /// The file names and page numbers below are hard-coded for 2017_TX_5T_EX.pdf and
        /// must be adjusted by hand for any other document. The method blocks on
        /// <c>Console.ReadKey()</c> before returning.
        /// </remarks>
        public void Run()
        {
            // MANUALLY ENTER VALUES FOR VARIABLES BELOW | (current values fit 2017_TX_5T_EX.pdf)
            var basePath = AppDomain.CurrentDomain.BaseDirectory;

            const string pdfFilename = @"Docs\2017_TX_5T_EX.pdf";

            const string csvFilename = @"KaiserDrugTierList.csv";

            // First PDF page (inclusive) that contains the tables
            const int firstPageTable = 15;

            // Last PDF page (inclusive) that contains the tables
            const int lastPageTable = 159;

            // Check manually, looking for the last page number found in the extracted text
            const int pageNumberOfLastPageOfTables = 145;

            //###################################################################################

            // Path.Combine copes with a base directory that lacks a trailing separator.
            var pdfReader = new PdfReader(System.IO.Path.Combine(basePath, pdfFilename));

            string text;
            try
            {
                text = ReadPdfAsText(pdfReader, firstPageTable, lastPageTable);
            }
            finally
            {
                // Release the underlying file even if text extraction fails.
                pdfReader.Close();
            }

            // Per-page boilerplate; in the extracted text it is followed by page-number + '\n'
            var textToRemove = "2017\n" +
                         "Texas Residents -- Find and estimate prices for medicines on this formulary at:\n" +
                         "https://www.myprime.com/v/BCBSTX/COMMERCIAL/TXMKTGNPLS/en/find-medicine.html\n" +
                         "BCBSTX Health Insurance Marketplace 5 Tier Drug List July 2017 ";

            for (var i = 1; i <= pageNumberOfLastPageOfTables; i++)
            {
                text = text.Replace(textToRemove + i + '\n', string.Empty);
            }

            // Column headings of the table
            textToRemove = "Drug Name\n" +
                         "Drug Tier\n" +
                         "Prior Authorization\n" +
                         "Step Therapy\n" +
                         "Dispensing Limits\n" +
                         "ACA\n" +
                         "Limited Distribution\n";

            text = text.Replace(textToRemove, string.Empty);

            // Remove all dots
            text = text.Replace("•", string.Empty);

            // Remember where all new lines were: every '\n' becomes the stand-alone word "$"
            text = text.Replace("\n", " $ ");

            var words = text.Split(' ');

            var drugs = new List<(string name, string tier)>();

            for (var index = 0; index < words.Length;)
            {
                var stringBuilder = new StringBuilder();

                // Skip newline markers and all-caps words to find the first word that isn't all caps.
                // It can be lowercase or just a '-' (which is included in all drugs that start with
                // upper case words). The bounds check must come first so that running off the end of
                // the array ends the scan instead of throwing.
                for (; index < words.Length && (words[index].Equals("$") || words[index].IsUpper()); ++index)
                { }

                if (index >= words.Length)
                    break;

                var retrieveWordsUpToIndex = -1;

                // Weird rule: Take all words from the first line moving backwards that contains a word
                // with alphabetic character and length > 1.
                if (words[index].Equals("-"))
                {
                    retrieveWordsUpToIndex = index;

                    // Walk backwards over the words that contain an alphabetic character.
                    for (index -= 1; index >= 0 && words[index].ContainsAlphabeticCharacter(); index--)
                    { }

                    // Go backwards until at start of current line; the start of the text counts as a line start.
                    for (; index > 0 && !words[index - 1].Equals("$"); index--)
                    { }

                    if (index < 0)
                        index = 0;
                }

                // Collect the brand-name part that precedes (and includes) the '-'.
                for (; index <= retrieveWordsUpToIndex; index++)
                {
                    if (words[index].Equals("$"))
                        continue;

                    stringBuilder.Append(words[index]);
                    stringBuilder.Append(' ');
                }

                // Collect everything up to the tier word. Index 0 has no preceding newline marker and
                // therefore can never be a tier, so IsTier is not consulted for it.
                for (; index < words.Length && (index == 0 || !IsTier(words, index) || words[index].Equals("$")); ++index)
                {
                    if (words[index].Equals("$"))
                        continue;

                    stringBuilder.Append(words[index]);
                    stringBuilder.Append(' ');
                }

                if (index >= words.Length)
                    break;

                drugs.Add((stringBuilder.ToString(), words[index]));

                // index currently points at the tier for a drug and should thus be incremented once
                ++index;
            }

            var sb = new StringBuilder();

            foreach (var drug in drugs)
            {
                sb.AppendLine("\"" + drug.name + "\",\"" + drug.tier + "\"");
            }

            // Add any drugs from the 'tail' of the table that was cut off

            sb.AppendLine("\"" + "ZORTRESS - everolimus tab 0.75 mg" + "\",\"" + "4" + "\"");

            System.IO.File.WriteAllText(System.IO.Path.Combine(basePath, csvFilename), sb.ToString());

            Console.WriteLine("Extracted information from " + drugs.Count + " drugs.");
            Console.ReadKey();
        }


        /// <summary>
        /// Determines whether the word at <paramref name="index"/> is a tier value that
        /// stands alone on its line.
        /// </summary>
        /// <param name="words">Word list in which the word "$" marks a line break.</param>
        /// <param name="index">Position of the candidate word.</param>
        /// <returns>
        /// <c>true</c> when the word is one of the known tier tokens, is preceded by "$" and is
        /// either followed by "$" or is the last word; otherwise <c>false</c>. Indices outside
        /// the list, and index 0 (which has no preceding marker), yield <c>false</c>.
        /// </returns>
        private static bool IsTier(IReadOnlyList<string> words, int index)
        {
            // A tier needs a preceding newline marker, so the first word can never qualify;
            // this also keeps words[index - 1] inside the list.
            if (index <= 0 || index >= words.Count)
                return false;

            if (!words[index - 1].Equals("$"))
                return false;

            // Special case for last entry in pdf table: there is no following marker to check.
            var isLastWord = index == words.Count - 1;
            if (!isLastWord && !words[index + 1].Equals("$"))
                return false;

            string[] tiers = { "1", "2", "3", "4", "5", "1,2", "A" };
            return tiers.Contains(words[index]);
        }


        /// <summary>
        /// Extracts the text of pages <paramref name="firstPage"/> through
        /// <paramref name="lastPage"/> (both inclusive) and returns it as one string.
        /// </summary>
        /// <param name="pdfReader">Open reader for the PDF document.</param>
        /// <param name="firstPage">First page to extract.</param>
        /// <param name="lastPage">Last page to extract.</param>
        /// <returns>The per-page texts appended in page order, with nothing inserted between pages.</returns>
        private static string ReadPdfAsText(PdfReader pdfReader, int firstPage, int lastPage)
        {
            var extracted = new StringBuilder();
            var page = firstPage;

            while (page <= lastPage)
            {
                // A new strategy object is created for every page, as in the original loop.
                var rawText = PdfTextExtractor.GetTextFromPage(pdfReader, page, new SimpleTextExtractionStrategy());

                // Round-trip the text through the platform default encoding into UTF-8.
                // NOTE(review): characters the default encoding cannot represent may not survive
                // this step - confirm whether the round trip is still wanted.
                var defaultBytes = Encoding.Default.GetBytes(rawText);
                var utf8Bytes = Encoding.Convert(Encoding.Default, Encoding.UTF8, defaultBytes);
                extracted.Append(Encoding.UTF8.GetString(utf8Bytes));

                page++;
            }

            return extracted.ToString();
        }

    }
}
	using System;
	using System.Collections.Generic;
	using System.Linq;
	using System.Text;
	using DataReader.Extensions;
	using iTextSharp.text.log;
	using iTextSharp.text.pdf;
	using iTextSharp.text.pdf.parser;

	namespace DataReader.ExploredCode
	{
	internal class OldProgram : IApplication
	{
	private readonly ILogger _logger;

	/// <summary>
	/// Creates the application and keeps the supplied logger in <c>_logger</c>.
	/// </summary>
	/// <param name="logger">Logger instance stored for later use.</param>
	public OldProgram(ILogger logger) => _logger = logger;
	/// <summary>
	/// Reads the drug-table pages of the formulary PDF, strips the boilerplate that
	/// is repeated on every page, pairs each drug name with its tier and writes the
	/// result as a quoted two-column CSV file in the application base directory.
	/// </summary>
	/// <remarks>
	/// The file names and page numbers below are hard-coded for 2017_TX_5T_EX.pdf and
	/// must be adjusted by hand for any other document. The method blocks on
	/// <c>Console.ReadKey()</c> before returning.
	/// </remarks>
	public void Run()
	{
		// MANUALLY ENTER VALUES FOR VARIABLES BELOW | (current values fit 2017_TX_5T_EX.pdf)
		var basePath = AppDomain.CurrentDomain.BaseDirectory;

		const string pdfFilename = @"Docs\2017_TX_5T_EX.pdf";

		const string csvFilename = @"KaiserDrugTierList.csv";

		// First PDF page (inclusive) that contains the tables
		const int firstPageTable = 15;

		// Last PDF page (inclusive) that contains the tables
		const int lastPageTable = 159;

		// Check manually, looking for the last page number found in the extracted text
		const int pageNumberOfLastPageOfTables = 145;

		//###################################################################################

		// Path.Combine copes with a base directory that lacks a trailing separator.
		var pdfReader = new PdfReader(System.IO.Path.Combine(basePath, pdfFilename));

		string text;
		try
		{
			text = ReadPdfAsText(pdfReader, firstPageTable, lastPageTable);
		}
		finally
		{
			// Release the underlying file even if text extraction fails.
			pdfReader.Close();
		}

		// Per-page boilerplate; in the extracted text it is followed by page-number + '\n'
		var textToRemove = "2017\n" +
			"Texas Residents -- Find and estimate prices for medicines on this formulary at:\n" +
			"https://www.myprime.com/v/BCBSTX/COMMERCIAL/TXMKTGNPLS/en/find-medicine.html\n" +
			"BCBSTX Health Insurance Marketplace 5 Tier Drug List July 2017 ";

		for (var i = 1; i <= pageNumberOfLastPageOfTables; i++)
		{
			text = text.Replace(textToRemove + i + '\n', string.Empty);
		}

		// Column headings of the table
		textToRemove = "Drug Name\n" +
			"Drug Tier\n" +
			"Prior Authorization\n" +
			"Step Therapy\n" +
			"Dispensing Limits\n" +
			"ACA\n" +
			"Limited Distribution\n";

		text = text.Replace(textToRemove, string.Empty);

		// Remove all dots
		text = text.Replace("•", string.Empty);

		// Remember where all new lines were: every '\n' becomes the stand-alone word "$"
		text = text.Replace("\n", " $ ");

		var words = text.Split(' ');

		var drugs = new List<(string name, string tier)>();

		for (var index = 0; index < words.Length;)
		{
			var stringBuilder = new StringBuilder();

			// Skip newline markers and all-caps words to find the first word that isn't all caps.
			// It can be lowercase or just a '-' (which is included in all drugs that start with
			// upper case words). The bounds check must come first so that running off the end of
			// the array ends the scan instead of throwing.
			for (; index < words.Length && (words[index].Equals("$") || words[index].IsUpper()); ++index)
			{ }

			if (index >= words.Length)
				break;

			var retrieveWordsUpToIndex = -1;

			// Weird rule: Take all words from the first line moving backwards that contains a word
			// with alphabetic character and length > 1.
			if (words[index].Equals("-"))
			{
				retrieveWordsUpToIndex = index;

				// Walk backwards over the words that contain an alphabetic character.
				for (index -= 1; index >= 0 && words[index].ContainsAlphabeticCharacter(); index--)
				{ }

				// Go backwards until at start of current line; the start of the text counts as a line start.
				for (; index > 0 && !words[index - 1].Equals("$"); index--)
				{ }

				if (index < 0)
					index = 0;
			}

			// Collect the brand-name part that precedes (and includes) the '-'.
			for (; index <= retrieveWordsUpToIndex; index++)
			{
				if (words[index].Equals("$"))
					continue;

				stringBuilder.Append(words[index]);
				stringBuilder.Append(' ');
			}

			// Collect everything up to the tier word. Index 0 has no preceding newline marker and
			// therefore can never be a tier, so IsTier is not consulted for it.
			for (; index < words.Length && (index == 0 || !IsTier(words, index) || words[index].Equals("$")); ++index)
			{
				if (words[index].Equals("$"))
					continue;

				stringBuilder.Append(words[index]);
				stringBuilder.Append(' ');
			}

			if (index >= words.Length)
				break;

			drugs.Add((stringBuilder.ToString(), words[index]));

			// index currently points at the tier for a drug and should thus be incremented once
			++index;
		}

		var sb = new StringBuilder();

		foreach (var drug in drugs)
		{
			sb.AppendLine("\"" + drug.name + "\",\"" + drug.tier + "\"");
		}

		// Add any drugs from the 'tail' of the table that was cut off

		sb.AppendLine("\"" + "ZORTRESS - everolimus tab 0.75 mg" + "\",\"" + "4" + "\"");

		System.IO.File.WriteAllText(System.IO.Path.Combine(basePath, csvFilename), sb.ToString());

		Console.WriteLine("Extracted information from " + drugs.Count + " drugs.");
		Console.ReadKey();
	}


	/// <summary>
	/// Determines whether the word at <paramref name="index"/> is a tier value that
	/// stands alone on its line.
	/// </summary>
	/// <param name="words">Word list in which the word "$" marks a line break.</param>
	/// <param name="index">Position of the candidate word.</param>
	/// <returns>
	/// <c>true</c> when the word is one of the known tier tokens, is preceded by "$" and is
	/// either followed by "$" or is the last word; otherwise <c>false</c>. Indices outside
	/// the list, and index 0 (which has no preceding marker), yield <c>false</c>.
	/// </returns>
	private static bool IsTier(IReadOnlyList<string> words, int index)
	{
		// A tier needs a preceding newline marker, so the first word can never qualify;
		// this also keeps words[index - 1] inside the list.
		if (index <= 0 || index >= words.Count)
			return false;

		if (!words[index - 1].Equals("$"))
			return false;

		// Special case for last entry in pdf table: there is no following marker to check.
		var isLastWord = index == words.Count - 1;
		if (!isLastWord && !words[index + 1].Equals("$"))
			return false;

		string[] tiers = { "1", "2", "3", "4", "5", "1,2", "A" };
		return tiers.Contains(words[index]);
	}



	/// <summary>
	/// Extracts the text of pages <paramref name="firstPage"/> through
	/// <paramref name="lastPage"/> (both inclusive) and returns it as one string.
	/// </summary>
	/// <param name="pdfReader">Open reader for the PDF document.</param>
	/// <param name="firstPage">First page to extract.</param>
	/// <param name="lastPage">Last page to extract.</param>
	/// <returns>The per-page texts appended in page order, with nothing inserted between pages.</returns>
	private static string ReadPdfAsText(PdfReader pdfReader, int firstPage, int lastPage)
	{
		var extracted = new StringBuilder();
		var page = firstPage;

		while (page <= lastPage)
		{
			// A new strategy object is created for every page, as in the original loop.
			var rawText = PdfTextExtractor.GetTextFromPage(pdfReader, page, new SimpleTextExtractionStrategy());

			// Round-trip the text through the platform default encoding into UTF-8.
			// NOTE(review): characters the default encoding cannot represent may not survive
			// this step - confirm whether the round trip is still wanted.
			var defaultBytes = Encoding.Default.GetBytes(rawText);
			var utf8Bytes = Encoding.Convert(Encoding.Default, Encoding.UTF8, defaultBytes);
			extracted.Append(Encoding.UTF8.GetString(utf8Bytes));

			page++;
		}

		return extracted.ToString();
	}

	}
	}