aabiddanda/.flake8

## .flake8
[flake8]
max-line-length = 120
exclude=
  .git,
  .github,
  env,
  venv,
  build,
  dist
ignore=
  # Block comment should start with '# '
  # Not if it's a commented out line
  E265,

  # Ambiguous variable names
  # It's absolutely fine to have i and I
  E741,

  # List comprehension redefines variable
  # Re-using throw-away variables like `i`, `x`, etc. is a Good Idea
  F812,

  # Blank line at end of file
  # This increases readability
  W391,

  # Line break before binary operator
  # This is now actually advised in pep8
  W503,

  # Line break after binary operator
  W504,

## .pre-commit-config.yaml
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
-   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v2.4.0
    hooks:
    -   id: trailing-whitespace
    -   id: end-of-file-fixer
    -   id: check-yaml
    -   id: check-added-large-files
        args: ['--maxkb=900']
-   repo: https://github.com/psf/black
    rev: 19.3b0
    hooks:
    -   id: black
-   repo: https://github.com/pycqa/pydocstyle
    rev: 4.0.0  # pick a git hash / tag to point to
    hooks:
    -   id: pydocstyle
# flake8 is hosted on GitHub; the former GitLab mirror is no longer available.
-   repo: https://github.com/pycqa/flake8
    rev: 3.7.9
    hooks:
    -   id: flake8

## pairwise_diff_uniparental.py
#!/usr/bin/env python3

import click
import numpy as np
import pandas as pd
from cyvcf2 import VCF
from tqdm import tqdm
import warnings


def create_population_pairs(pops, ignore_pops=None):
    """Create every unordered pair of distinct population labels.

    Args:
        pops: NumPy array of population labels, one entry per sample.
        ignore_pops: Optional container of labels to leave out of the
            pairing. ``None`` (the default) ignores nothing.

    Returns:
        A list of ``(p1, p2)`` tuples. Labels follow the sorted order
        returned by ``np.unique`` and each unordered pair appears once.

    Raises:
        AssertionError: If ``pops`` is empty or contains fewer than two
            distinct labels.
    """
    assert pops.size > 0
    uniq_pops = np.unique(pops)
    assert uniq_pops.size > 1
    # A None default avoids sharing one mutable list object between calls.
    ignored = () if ignore_pops is None else ignore_pops
    kept = [p for p in uniq_pops if p not in ignored]
    # np.unique already de-duplicates, so pairing each label with the labels
    # after it yields every unordered pair once without a "seen" lookup.
    return [(p1, p2) for i, p1 in enumerate(kept) for p2 in kept[i + 1:]]


def verify_vcf_samples(vcf_file, indIDs, **kwargs):
    """Warn about VCF samples that are absent from the supplied ID list.

    Args:
        vcf_file: Path of the VCF file, passed to ``cyvcf2.VCF``.
        indIDs: Iterable of sample IDs known to the caller.
        **kwargs: Extra keyword arguments forwarded to ``cyvcf2.VCF``.

    Returns:
        None. One warning is issued per VCF sample not found in ``indIDs``.
    """
    vcf = VCF(vcf_file, **kwargs)
    # Build the lookup once instead of scanning the ID array per sample.
    known_ids = set(indIDs)
    for samp in vcf.samples:
        if samp not in known_ids:
            # The sample comes from the VCF; what is missing is its entry in
            # the supplied ID list, so the message says exactly that.
            warnings.warn(
                f"Sample {samp} in VCF file {vcf_file} is not in the supplied sample list!"
            )


def read_vcf(vcf_file, qual_filter=30, **kwargs):
    """Read a VCF variant-by-variant into a genotype matrix.

    Args:
        vcf_file: Path of the VCF file, passed to ``cyvcf2.VCF``.
        qual_filter: Variants are kept only when their QUAL is strictly
            greater than this value.
        **kwargs: Extra keyword arguments forwarded to ``cyvcf2.VCF``.

    Returns:
        A tuple ``(geno, samples)``: ``geno`` stacks the ``gt_types`` vector
        of every retained variant (one row per variant, one column per
        sample) and ``samples`` is an array of the VCF sample names.

    Raises:
        ValueError: If no variant passes the quality filter.
    """
    vcf = VCF(vcf_file, **kwargs)
    samples = np.asarray(vcf.samples)
    # QUAL is None when the VCF leaves the field missing; such records cannot
    # pass a numeric threshold and comparing None with a number would raise.
    geno = [
        v.gt_types.copy()
        for v in vcf
        if v.QUAL is not None and v.QUAL > qual_filter
    ]
    if not geno:
        raise ValueError(
            f"No variants in {vcf_file} passed the QUAL > {qual_filter} filter."
        )
    return np.vstack(geno), samples


def extract_pops(geno, samples, pop_samples):
    """Select the genotype columns that belong to the requested samples.

    Args:
        geno: 2-D genotype array with one column per entry of ``samples``.
        samples: Array of sample names, in genotype column order.
        pop_samples: Iterable of sample names to pull out.

    Returns:
        A tuple of the column subset of ``geno`` (in ``pop_samples`` order)
        and an array of the names that were actually found.

    Raises:
        ValueError: If fewer than two requested samples are present.
    """
    assert geno.shape[1] == samples.size
    columns, found = [], []
    for sample_id in pop_samples:
        hits = np.where(samples == sample_id)[0]
        if hits.size == 0:
            warnings.warn(f"sample {sample_id} not in VCF!")
            continue
        # Use the first matching column when a name occurs more than once.
        columns.append(hits[0])
        found.append(sample_id)

    columns = np.array(columns)
    if columns.size < 2:
        raise ValueError("Need at least two samples to calculate pairwise differences.")
    return geno[:, columns], np.array(found)


def haplotype_diversity(haps, vcf_file, treat_missing=False, **kwargs):
    """Compute haplotype diversity and the number of distinct haplotypes.

    Args:
        haps: 2-D genotype array; each column is treated as one haplotype.
        vcf_file: Path of the VCF file; opened only to read the ``UNKNOWN``
            genotype code of the reader.
        treat_missing: Accepted for signature parity; not used here.
        **kwargs: Extra keyword arguments forwarded to ``cyvcf2.VCF``.

    Returns:
        A tuple ``(hap_div, n_haps)`` where ``hap_div`` is
        ``n / (n - 1) * (1 - sum(freq ** 2))`` over the distinct columns and
        ``n_haps`` is the number of distinct columns.
    """
    assert haps.ndim == 2
    assert haps.shape[1] > 2
    reader = VCF(vcf_file, **kwargs)
    # Collapse to a boolean matrix: True where the code is neither 0 nor the
    # reader's UNKNOWN code.
    carries_alt = np.logical_and(haps != 0, haps != reader.UNKNOWN)
    _, counts = np.unique(carries_alt, axis=1, return_counts=True)
    freqs = counts / counts.sum()
    n_samples = haps.shape[1]
    correction = n_samples / (n_samples - 1)
    diversity = correction * (1.0 - np.sum(freqs ** 2))
    return diversity, counts.size


def pairwise_diff(haps, vcf_file, treat_missing=False, **kwargs):
    """Compute the per-site difference for every ordered pair of columns.

    Args:
        haps: 2-D genotype array (rows are sites, columns are samples).
        vcf_file: Path of the VCF file; opened to read the ``UNKNOWN``
            genotype code of the reader.
        treat_missing: When true, sites where either sample carries the
            ``UNKNOWN`` code are excluded from both the sum and the
            denominator; otherwise the sum is divided by the number of rows.
        **kwargs: Extra keyword arguments forwarded to ``cyvcf2.VCF``.

    Returns:
        A 1-D array with one value per ordered pair ``(i, j)``, ``i != j``,
        so every unordered pair contributes twice.
    """
    assert haps.ndim == 2
    assert haps.shape[1] > 1
    reader = VCF(vcf_file, **kwargs)
    n_sites, n_samples = haps.shape

    diffs = []
    for a in range(n_samples):
        for b in range(n_samples):
            if a == b:
                continue
            abs_diff = np.abs(haps[:, a] - haps[:, b])
            if not treat_missing:
                diffs.append(np.sum(abs_diff) / n_sites)
                continue
            # Keep only sites where both samples have a called genotype.
            observed = (haps[:, a] != reader.UNKNOWN) & (haps[:, b] != reader.UNKNOWN)
            diffs.append(np.sum(abs_diff, where=observed) / np.sum(observed))
    return np.asarray(diffs)


def bootstrap_pairwise_differences(
    haps_combined, vcf_file, num_reps=10, treat_missing=False, **kwargs
):
    """Compute resampled pairwise differences for two random groups.

    For each replicate the columns of ``haps_combined`` are split at random
    (without replacement) into a first group of ``n // 2`` columns and a
    second group holding all remaining columns; the mean of
    ``pairwise_diff`` is recorded for each group.

    Args:
        haps_combined: 2-D genotype array with the columns of both
            populations side by side.
        vcf_file: Path of the VCF file, forwarded to ``pairwise_diff``.
        num_reps: Number of random splits to draw.
        treat_missing: Forwarded to ``pairwise_diff``.
        **kwargs: Extra keyword arguments forwarded to ``pairwise_diff``.

    Returns:
        A tuple ``(p1, p2, pairwise_distance)`` of length-``num_reps``
        arrays: the mean difference in each group and their absolute gap.
    """
    assert haps_combined.ndim == 2
    assert haps_combined.shape[1] > 2
    n_ind = haps_combined.shape[1]
    p1 = np.zeros(num_reps)
    p2 = np.zeros(num_reps)

    for i in tqdm(range(num_reps)):
        # Randomly sample from the collected set of both samples ...
        idx = np.random.choice(n_ind, replace=False, size=n_ind // 2)
        # `~idx` on an integer index array is a bitwise NOT (negative
        # indices), not the complement; use a boolean mask so the second
        # group is exactly the columns that were not drawn.
        in_first = np.zeros(n_ind, dtype=bool)
        in_first[idx] = True
        hap1 = haps_combined[:, idx]
        hap2 = haps_combined[:, ~in_first]
        diff1 = pairwise_diff(hap1, vcf_file, treat_missing=treat_missing, **kwargs)
        diff2 = pairwise_diff(hap2, vcf_file, treat_missing=treat_missing, **kwargs)
        p1[i] = np.mean(diff1)
        p2[i] = np.mean(diff2)
    # Setup the full set of pairwise differences
    pairwise_distance = np.abs(p1 - p2)
    return p1, p2, pairwise_distance


def bootstrap_haplotype_diversity(
    haps_combined, vcf_file, num_reps, treat_missing=False, **kwargs
):
    """Compute resampled haplotype diversity statistics for two random groups.

    For each replicate the columns of ``haps_combined`` are split at random
    (without replacement) into a first group of ``n // 2`` columns and a
    second group holding all remaining columns; ``haplotype_diversity`` is
    evaluated on each group.

    Args:
        haps_combined: 2-D genotype array with the columns of both
            populations side by side.
        vcf_file: Path of the VCF file, forwarded to ``haplotype_diversity``.
        num_reps: Number of random splits to draw.
        treat_missing: Forwarded to ``haplotype_diversity``.
        **kwargs: Extra keyword arguments forwarded to
            ``haplotype_diversity``.

    Returns:
        A tuple of six length-``num_reps`` arrays: haplotype counts of group
        one and two, their absolute difference, haplotype diversity of group
        one and two, and their absolute difference.
    """
    assert haps_combined.ndim == 2
    assert haps_combined.shape[1] > 2
    n_ind = haps_combined.shape[1]
    num_haps1 = np.zeros(num_reps)
    num_haps2 = np.zeros(num_reps)
    hap_div1 = np.zeros(num_reps)
    hap_div2 = np.zeros(num_reps)
    for i in tqdm(range(num_reps)):
        # Randomly sample from the collected set of both samples ...
        idx = np.random.choice(n_ind, replace=False, size=n_ind // 2)
        # `~idx` on an integer index array is a bitwise NOT (negative
        # indices), not the complement; use a boolean mask so the second
        # group is exactly the columns that were not drawn.
        in_first = np.zeros(n_ind, dtype=bool)
        in_first[idx] = True
        hap1 = haps_combined[:, idx]
        hap2 = haps_combined[:, ~in_first]
        cur_hap_div1, n_hap1 = haplotype_diversity(
            hap1, vcf_file, treat_missing=treat_missing, **kwargs
        )
        cur_hap_div2, n_hap2 = haplotype_diversity(
            hap2, vcf_file, treat_missing=treat_missing, **kwargs
        )
        num_haps1[i] = n_hap1
        num_haps2[i] = n_hap2
        hap_div1[i] = cur_hap_div1
        hap_div2[i] = cur_hap_div2
    # Absolute gaps between the two groups for each replicate
    pairwise_n_hap = np.abs(num_haps1 - num_haps2)
    pairwise_hap_div = np.abs(hap_div1 - hap_div2)
    return num_haps1, num_haps2, pairwise_n_hap, hap_div1, hap_div2, pairwise_hap_div


def create_pairwise_diff_df(
    vcf, poplist, ignore_pops, missing_correct, nreps, seed, out
):
    """Create a data frame of observed and resampled pairwise differences.

    Args:
        vcf: Path of the VCF file.
        poplist: Path of a whitespace-separated table with ``indIDs`` and
            ``popIDs`` columns.
        ignore_pops: Comma-separated population labels to skip.
        missing_correct: Forwarded as ``treat_missing`` to the statistics.
        nreps: Number of random resampling replicates per population pair.
        seed: Seed for NumPy's global random number generator.
        out: Unused here; the caller writes the returned frame.

    Returns:
        A ``pandas.DataFrame`` with columns ``popA``, ``popB``, ``reps``,
        ``piA``, ``piB`` and ``pi_diff``. For each pair, ``reps == 0`` holds
        the observed values and ``1..nreps`` the resampled ones.
    """
    assert seed > 0
    assert nreps > 0
    np.random.seed(seed)
    pop_df = pd.read_csv(poplist, sep=r"\s+", dtype=str)
    verify_vcf_samples(vcf, pop_df.indIDs.values, lazy=True)
    pops_to_ignore = ignore_pops.split(",")
    # The numeric code of VCF.UNKNOWN depends on the reader's gts012 setting,
    # so the genotypes and every statistic must be produced with the same
    # reader options or the missing-data mask targets the wrong code.
    vcf_opts = {"strict_gt": True, "gts012": True}
    geno, samples = read_vcf(vcf, **vcf_opts)
    assert geno.ndim > 1
    popA = []
    popB = []
    reps = []
    piA = []
    piB = []
    pi_diff = []
    pop_pairs = create_population_pairs(pop_df.popIDs.values, pops_to_ignore)
    for (p1, p2) in tqdm(pop_pairs):
        p1_samples = pop_df.indIDs[pop_df.popIDs == p1].values
        p2_samples = pop_df.indIDs[pop_df.popIDs == p2].values
        haps_p1, _ = extract_pops(geno, samples, p1_samples)
        haps_p2, _ = extract_pops(geno, samples, p2_samples)
        # Observed statistics on the real population labels.
        full_mean_pi1 = np.mean(
            pairwise_diff(haps_p1, vcf, treat_missing=missing_correct, **vcf_opts)
        )
        full_mean_pi2 = np.mean(
            pairwise_diff(haps_p2, vcf, treat_missing=missing_correct, **vcf_opts)
        )
        full_diff = np.abs(full_mean_pi1 - full_mean_pi2)
        # Resampled statistics on random splits of the pooled samples.
        hap_combined = np.hstack([haps_p1, haps_p2])
        pi_p1, pi_p2, cur_pi_diff = bootstrap_pairwise_differences(
            hap_combined,
            vcf,
            num_reps=nreps,
            treat_missing=missing_correct,
            **vcf_opts,
        )
        # Row 0 of every pair carries the observed values.
        reps.append(np.arange(0, nreps + 1))
        popA.append(np.repeat(p1, nreps + 1))
        popB.append(np.repeat(p2, nreps + 1))
        piA.append(np.insert(pi_p1, 0, full_mean_pi1, axis=0))
        piB.append(np.insert(pi_p2, 0, full_mean_pi2, axis=0))
        pi_diff.append(np.insert(cur_pi_diff, 0, full_diff, axis=0))
    test_df = pd.DataFrame(
        {
            "popA": np.concatenate(popA),
            "popB": np.concatenate(popB),
            "reps": np.concatenate(reps),
            "piA": np.concatenate(piA),
            "piB": np.concatenate(piB),
            "pi_diff": np.concatenate(pi_diff),
        }
    )
    return test_df


def create_pairwise_hap_df(
    vcf, poplist, ignore_pops, missing_correct, nreps, seed, out
):
    """Create a data frame of observed and resampled haplotype diversity.

    Args:
        vcf: Path of the VCF file.
        poplist: Path of a whitespace-separated table with ``indIDs`` and
            ``popIDs`` columns.
        ignore_pops: Comma-separated population labels to skip.
        missing_correct: Forwarded as ``treat_missing`` to the statistics.
        nreps: Number of random resampling replicates per population pair.
        seed: Seed for NumPy's global random number generator.
        out: Unused here; the caller writes the returned frame.

    Returns:
        A ``pandas.DataFrame`` with columns ``popA``, ``popB``, ``reps``,
        ``n_hapA``, ``n_hapB``, ``n_hap_paired``, ``hap_divA``, ``hap_divB``
        and ``hap_div_paired``. For each pair, ``reps == 0`` holds the
        observed values and ``1..nreps`` the resampled ones.
    """
    assert seed > 0
    assert nreps > 0
    np.random.seed(seed)
    pop_df = pd.read_csv(poplist, sep=r"\s+", dtype=str)
    verify_vcf_samples(vcf, pop_df.indIDs.values, lazy=True)
    pops_to_ignore = ignore_pops.split(",")
    # The numeric code of VCF.UNKNOWN depends on the reader's gts012 setting,
    # so the genotypes and every statistic must be produced with the same
    # reader options or the missing-data mask targets the wrong code.
    vcf_opts = {"strict_gt": True, "gts012": True}
    geno, samples = read_vcf(vcf, **vcf_opts)
    assert geno.ndim > 1
    column_names = (
        "popA",
        "popB",
        "reps",
        "n_hapA",
        "n_hapB",
        "n_hap_paired",
        "hap_divA",
        "hap_divB",
        "hap_div_paired",
    )
    columns = {name: [] for name in column_names}
    pop_pairs = create_population_pairs(pop_df.popIDs.values, pops_to_ignore)
    for (p1, p2) in tqdm(pop_pairs):
        p1_samples = pop_df.indIDs[pop_df.popIDs == p1].values
        p2_samples = pop_df.indIDs[pop_df.popIDs == p2].values
        haps_p1, _ = extract_pops(geno, samples, p1_samples)
        haps_p2, _ = extract_pops(geno, samples, p2_samples)
        # Observed statistics on the real population labels.
        full_hap_div1, full_n_hap1 = haplotype_diversity(
            haps_p1, vcf, treat_missing=missing_correct, **vcf_opts
        )
        full_hap_div2, full_n_hap2 = haplotype_diversity(
            haps_p2, vcf, treat_missing=missing_correct, **vcf_opts
        )
        full_hap_diff = np.abs(full_hap_div1 - full_hap_div2)
        full_n_hap_diff = np.abs(full_n_hap1 - full_n_hap2)
        # Resampled statistics on random splits of the pooled samples.
        hap_combined = np.hstack([haps_p1, haps_p2])
        (
            num_haps1,
            num_haps2,
            pairwise_n_hap,
            hap_div1,
            hap_div2,
            pairwise_hap_div,
        ) = bootstrap_haplotype_diversity(
            hap_combined,
            vcf,
            num_reps=nreps,
            treat_missing=missing_correct,
            **vcf_opts,
        )
        # Row 0 of every pair carries the observed values.
        columns["reps"].append(np.arange(0, nreps + 1))
        columns["popA"].append(np.repeat(p1, nreps + 1))
        columns["popB"].append(np.repeat(p2, nreps + 1))
        columns["n_hapA"].append(np.insert(num_haps1, 0, full_n_hap1, axis=0))
        columns["n_hapB"].append(np.insert(num_haps2, 0, full_n_hap2, axis=0))
        columns["n_hap_paired"].append(
            np.insert(pairwise_n_hap, 0, full_n_hap_diff, axis=0)
        )
        columns["hap_divA"].append(np.insert(hap_div1, 0, full_hap_div1, axis=0))
        columns["hap_divB"].append(np.insert(hap_div2, 0, full_hap_div2, axis=0))
        columns["hap_div_paired"].append(
            np.insert(pairwise_hap_div, 0, full_hap_diff, axis=0)
        )

    test_df = pd.DataFrame(
        {name: np.concatenate(chunks) for name, chunks in columns.items()}
    )
    return test_df


@click.command()
@click.option("--vcf", "-v", help="Input VCF file.")
@click.option("--poplist", "-p", help="List of populations for each sample in the VCF")
@click.option("--ignore_pops", "-i", type=str, default="", help="Populations to ignore")
@click.option(
    "--missing_correct",
    "-c",
    type=bool,
    default=True,
    help="Correction for missingness in pairwise differences.",
)
@click.option(
    "--nreps", "-n", type=int, default=100, help="Number of bootstrap replicates."
)
@click.option(
    "--mode", "-m", default="pairwise_diff", help="Mode for statistics computed."
)
@click.option("--seed", "-s", type=int, default=42, help="Random number seed.")
@click.option("--out", "-o", default="out.csv", help="Output CSV")
def main(vcf, poplist, ignore_pops, missing_correct, nreps, mode, seed, out):
    """Run the main function for running scripted analyses."""
    # Map each supported mode to the function that builds its result table.
    builders = {
        "pairwise_diff": create_pairwise_diff_df,
        "haplotype_diff": create_pairwise_hap_df,
    }
    if mode not in builders:
        raise ValueError(f"{mode} is not a valid mode!")
    build_table = builders[mode]
    result = build_table(vcf, poplist, ignore_pops, missing_correct, nreps, seed, out)
    # Persist the table without the pandas index column.
    result.to_csv(out, index=False)


if __name__ == "__main__":
    main()

## pixy_pairwise_diff.py
#!/usr/bin/env python3

import click
import numpy as np
import pandas as pd
from cyvcf2 import VCF
from tqdm import tqdm
import subprocess
import tempfile
import warnings


def create_population_pairs(pops, ignore_pops=None):
    """Create every unordered pair of distinct population labels.

    Args:
        pops: NumPy array of population labels, one entry per sample.
        ignore_pops: Optional container of labels to leave out of the
            pairing. ``None`` (the default) ignores nothing.

    Returns:
        A list of ``(p1, p2)`` tuples. Labels follow the sorted order
        returned by ``np.unique`` and each unordered pair appears once.

    Raises:
        AssertionError: If ``pops`` is empty or contains fewer than two
            distinct labels.
    """
    assert pops.size > 0
    uniq_pops = np.unique(pops)
    assert uniq_pops.size > 1
    # A None default avoids sharing one mutable list object between calls.
    ignored = () if ignore_pops is None else ignore_pops
    kept = [p for p in uniq_pops if p not in ignored]
    # np.unique already de-duplicates, so pairing each label with the labels
    # after it yields every unordered pair once without a "seen" lookup.
    return [(p1, p2) for i, p1 in enumerate(kept) for p2 in kept[i + 1:]]


def verify_vcf_samples(vcf_file, indIDs, **kwargs):
    """Warn about VCF samples that are absent from the supplied ID list.

    Args:
        vcf_file: Path of the VCF file, passed to ``cyvcf2.VCF``.
        indIDs: Iterable of sample IDs known to the caller.
        **kwargs: Extra keyword arguments forwarded to ``cyvcf2.VCF``.

    Returns:
        None. One warning is issued per VCF sample not found in ``indIDs``.
    """
    vcf = VCF(vcf_file, **kwargs)
    # Build the lookup once instead of scanning the ID array per sample.
    known_ids = set(indIDs)
    for samp in vcf.samples:
        if samp not in known_ids:
            # The sample comes from the VCF; what is missing is its entry in
            # the supplied ID list, so the message says exactly that.
            warnings.warn(
                f"Sample {samp} in VCF file {vcf_file} is not in the supplied sample list!"
            )

def read_vcf_samples(vcf_file, qual_filter=30, **kwargs):
    """Read a VCF variant-by-variant into a genotype matrix.

    Args:
        vcf_file: Path of the VCF file, passed to ``cyvcf2.VCF``.
        qual_filter: Variants are kept only when their QUAL is strictly
            greater than this value.
        **kwargs: Extra keyword arguments forwarded to ``cyvcf2.VCF``.

    Returns:
        A tuple ``(geno, samples)``: ``geno`` stacks the ``gt_types`` vector
        of every retained variant (one row per variant, one column per
        sample) and ``samples`` is an array of the VCF sample names.

    Raises:
        ValueError: If no variant passes the quality filter.
    """
    vcf = VCF(vcf_file, **kwargs)
    samples = np.asarray(vcf.samples)
    # QUAL is None when the VCF leaves the field missing; such records cannot
    # pass a numeric threshold and comparing None with a number would raise.
    geno = [
        v.gt_types.copy()
        for v in vcf
        if v.QUAL is not None and v.QUAL > qual_filter
    ]
    if not geno:
        raise ValueError(
            f"No variants in {vcf_file} passed the QUAL > {qual_filter} filter."
        )
    return np.vstack(geno), samples

def shuffle_pops(pop_df, tmp_file, pops=('A', 'B'), seed=42):
    """Shuffle the labels of the selected populations and write the table.

    Rows whose ``popIDs`` value is in ``pops`` have their labels permuted
    among themselves; all other rows keep their label. The ``popIDs`` column
    of ``pop_df`` is replaced in place and the frame is written to
    ``tmp_file`` as a tab-separated file without header or index.

    Args:
        pop_df: ``pandas.DataFrame`` with a ``popIDs`` column.
        tmp_file: Path (or buffer) handed to ``DataFrame.to_csv``.
        pops: Population labels whose members are shuffled.
        seed: Positive seed for NumPy's global random number generator.

    Raises:
        AssertionError: If ``seed`` is not positive.
    """
    assert seed > 0
    np.random.seed(seed)
    # Copy the labels: ``Series.values`` may be a read-only view of the
    # frame's data, and writing through it is not guaranteed to work.
    pop_vals = pop_df.popIDs.to_numpy(copy=True)
    idx = np.flatnonzero([x in pops for x in pop_vals])
    idx_shuff = np.random.choice(idx, size=idx.size, replace=False)
    # The right-hand side is materialised before assignment, so this is a
    # true permutation of the selected labels.
    pop_vals[idx] = pop_vals[idx_shuff]
    pop_df['popIDs'] = pop_vals
    pop_df.to_csv(tmp_file, sep="\t", index=False, header=False)

def run_pixy(vcf_file, pop_file, out_prefix, chrom="MT", seq_length=100000, skip_invariant=False, n_cores=4):
    """Run pixy to estimate the pairwise differences.

    Args:
        vcf_file: Path passed to pixy's ``--vcf``.
        pop_file: Path passed to pixy's ``--populations``.
        out_prefix: Value passed to pixy's ``--output_prefix``.
        chrom: Value passed to ``--chromosomes``.
        seq_length: Value passed to ``--window_size``.
        skip_invariant: When true, ``--bypass_invariant_check yes`` is added.
        n_cores: Value passed to ``--n_cores``.

    Returns:
        The exit status reported by the ``pixy`` process.
    """
    # Build the shared part of the command once; only the invariant-site
    # flag differs between the two invocation variants.
    cmd = [
        'pixy', '--stats', 'pi',
        '--vcf', vcf_file,
        '--populations', pop_file,
        '--n_cores', str(n_cores),
        '--chromosomes', str(chrom),
        '--window_size', str(seq_length),
    ]
    if skip_invariant:
        cmd += ['--bypass_invariant_check', 'yes']
    cmd += ['--output_prefix', out_prefix]
    status = subprocess.call(cmd)
    if status != 0:
        # Surface failures instead of silently dropping the exit status,
        # without aborting the remaining runs.
        warnings.warn(f"pixy exited with status {status} for output prefix {out_prefix}")
    return status


@click.command()
@click.option("--vcf", "-v", help="Input VCF file.")
@click.option("--poplist", "-p", help="List of populations for each sample in the VCF")
@click.option("--ignore_pops", "-i", type=str, default="", help="Populations to ignore")
@click.option("--chrom", "-c", type=str, default="MT", help="Chromosome")
@click.option("--seq_length", "-l", type=int, default=16569, help="Chromosome Length")
@click.option(
    "--nreps", "-n", type=int, default=100, help="Number of bootstrap replicates."
)
@click.option('--skip', type=bool, default=False, help="Skip invariant-sites adjustments")
# NOTE(review): the default prefix of 42 looks copied from a seed option - confirm.
@click.option("--out", "-o", type=str, default=42, help="Output prefix.")
def main(vcf, poplist, ignore_pops, chrom, seq_length, nreps, skip, out):
    """Run pixy on the real labels and on shuffled labels for every pair.

    For each pair of populations, pixy is run once on the unmodified
    population table (output prefix ``<out>_<p1>_<p2>_true``) and then
    ``nreps`` times on tables in which the labels of the two populations
    are shuffled (output prefix ``<out>_<p1>_<p2>_rep<k>``).
    """
    # Read the poplists here ...
    pop_df = pd.read_csv(poplist, sep=r"\s+")
    # Split the comma-separated option into labels; passing the raw string
    # would make the membership test a substring match.
    pops_to_ignore = [p for p in ignore_pops.split(",") if p]
    pop_pairs = create_population_pairs(pop_df.popIDs.values, pops_to_ignore)
    for (p1, p2) in tqdm(pop_pairs):
        # Context managers close (and thereby remove) each temporary file
        # as soon as pixy has finished reading it.
        with tempfile.NamedTemporaryFile() as temp:
            pop_df.to_csv(temp.name, sep="\t", header=False, index=False)
            run_pixy(
                vcf_file=vcf,
                pop_file=temp.name,
                out_prefix=f'{out}_{p1}_{p2}_true',
                chrom=chrom,
                seq_length=seq_length,
                skip_invariant=skip,
            )
        for i in range(nreps):
            with tempfile.NamedTemporaryFile() as temp2:
                # Shuffle a copy so the original labels stay intact.
                cur_df = pop_df.copy()
                shuffle_pops(cur_df, temp2.name, pops=[p1, p2], seed=i + 1)
                run_pixy(
                    vcf_file=vcf,
                    pop_file=temp2.name,
                    out_prefix=f'{out}_{p1}_{p2}_rep{i+1}',
                    chrom=chrom,
                    seq_length=seq_length,
                    skip_invariant=skip,
                )


if __name__ == "__main__":
    main()

## test.output.csv

          
            popA
            popB
            reps
            piA
            piB
            pi_diff

            
              A
              B
              1
              0.5
              0.5
              0.0

            
              A
              B
              2
              0.3333333333333333
              1.0
              0.6666666666666667

            
              A
              B
              3
              0.5
              0.5
              0.0

            
              A
              B
              4
              0.3333333333333333
              1.0
              0.6666666666666667

            
              A
              B
              5
              0.5
              1.0
              0.5

            
              A
              B
              6
              0.6666666666666666
              0.5
              0.16666666666666663

            
              A
              B
              7
              0.6666666666666666
              1.0
              0.33333333333333337

            
              A
              B
              8
              0.3333333333333333
              1.0
              0.6666666666666667

            
              A
              B
              9
              0.5
              1.0
              0.5

            
              A
              B
              10
              0.3333333333333333
              1.0
              0.6666666666666667

            
              A
              B
              11
              0.5
              0.5
              0.0

            
              A
              B
              12
              0.5
              1.0
              0.5

            
              A
              B
              13
              0.3333333333333333
              0.5
              0.16666666666666669

            
              A
              B
              14
              0.5
              1.0
              0.5

            
              A
              B
              15
              0.3333333333333333
              1.0
              0.6666666666666667

            
              A
              B
              16
              0.3333333333333333
              1.0
              0.6666666666666667

            
              A
              B
              17
              0.6666666666666666
              1.0
              0.33333333333333337

            
              A
              B
              18
              0.5
              0.5
              0.0

            
              A
              B
              19
              0.5
              0.5
              0.0

            
              A
              B
              20
              1.0
              1.0
              0.0

            
              A
              B
              21
              1.0
              1.0
              0.0

            
              A
              B
              22
              0.5
              0.5
              0.0

            
              A
              B
              23
              0.3333333333333333
              1.0
              0.6666666666666667

            
              A
              B
              24
              0.3333333333333333
              1.0
              0.6666666666666667

            
              A
              B
              25
              0.6666666666666666
              1.0
              0.33333333333333337

            
              A
              B
              26
              0.3333333333333333
              0.5
              0.16666666666666669

            
              A
              B
              27
              0.5
              0.5
              0.0

            
              A
              B
              28
              0.3333333333333333
              0.5
              0.16666666666666669

            
              A
              B
              29
              0.5
              1.0
              0.5

            
              A
              B
              30
              1.0
              1.0
              0.0

            
              A
              B
              31
              1.0
              0.5
              0.5

            
              A
              B
              32
              0.5
              0.5
              0.0

            
              A
              B
              33
              0.5
              0.5
              0.0

            
              A
              B
              34
              0.5
              0.5
              0.0

            
              A
              B
              35
              0.3333333333333333
              1.0
              0.6666666666666667

            
              A
              B
              36
              0.6666666666666666
              0.5
              0.16666666666666663

            
              A
              B
              37
              1.0
              1.0
              0.0

            
              A
              B
              38
              0.5
              0.5
              0.0

            
              A
              B
              39
              0.5
              1.0
              0.5

            
              A
              B
              40
              0.5
              0.5
              0.0

            
              A
              B
              41
              0.5
              0.5
              0.0

            
              A
              B
              42
              1.0
              0.5
              0.5

            
              A
              B
              43
              0.5
              0.5
              0.0

            
              A
              B
              44
              0.5
              1.0
              0.5

            
              A
              B
              45
              0.5
              0.5
              0.0

            
              A
              B
              46
              0.5
              1.0
              0.5

            
              A
              B
              47
              1.0
              1.0
              0.0

            
              A
              B
              48
              0.5
              1.0
              0.5

            
              A
              B
              49
              1.0
              1.0
              0.0

            
              A
              B
              50
              0.3333333333333333
              0.5
              0.16666666666666669

            
              A
              B
              51
              0.3333333333333333
              1.0
              0.6666666666666667

            
              A
              B
              52
              0.3333333333333333
              1.0
              0.6666666666666667

            
              A
              B
              53
              0.6666666666666666
              1.0
              0.33333333333333337

            
              A
              B
              54
              1.0
              0.5
              0.5

            
              A
              B
              55
              0.3333333333333333
              0.5
              0.16666666666666669

            
              A
              B
              56
              0.3333333333333333
              0.5
              0.16666666666666669

            
              A
              B
              57
              0.3333333333333333
              0.5
              0.16666666666666669

            
              A
              B
              58
              0.3333333333333333
              1.0
              0.6666666666666667

            
              A
              B
              59
              0.3333333333333333
              1.0
              0.6666666666666667

            
              A
              B
              60
              0.3333333333333333
              0.5
              0.16666666666666669

            
              A
              B
              61
              0.6666666666666666
              1.0
              0.33333333333333337

            
              A
              B
              62
              0.6666666666666666
              1.0
              0.33333333333333337

            
              A
              B
              63
              0.6666666666666666
              1.0
              0.33333333333333337

            
              A
              B
              64
              0.3333333333333333
              0.5
              0.16666666666666669

            
              A
              B
              65
              0.3333333333333333
              0.5
              0.16666666666666669

            
              A
              B
              66
              0.5
              1.0
              0.5

            
              A
              B
              67
              0.3333333333333333
              0.5
              0.16666666666666669

            
              A
              B
              68
              0.6666666666666666
              0.5
              0.16666666666666663

            
              A
              B
              69
              1.0
              0.5
              0.5

            
              A
              B
              70
              0.6666666666666666
              0.5
              0.16666666666666663

            
              A
              B
              71
              0.5
              0.5
              0.0

            
              A
              B
              72
              0.3333333333333333
              1.0
              0.6666666666666667

            
              A
              B
              73
              1.0
              0.5
              0.5

            
              A
              B
              74
              0.6666666666666666
              1.0
              0.33333333333333337

            
              A
              B
              75
              0.6666666666666666
              1.0
              0.33333333333333337

            
              A
              B
              76
              0.5
              0.5
              0.0

            
              A
              B
              77
              0.6666666666666666
              1.0
              0.33333333333333337

            
              A
              B
              78
              0.3333333333333333
              1.0
              0.6666666666666667

            
              A
              B
              79
              0.6666666666666666
              1.0
              0.33333333333333337

            
              A
              B
              80
              1.0
              0.5
              0.5

            
              A
              B
              81
              0.6666666666666666
              0.5
              0.16666666666666663

            
              A
              B
              82
              0.6666666666666666
              1.0
              0.33333333333333337

            
              A
              B
              83
              0.5
              0.5
              0.0

            
              A
              B
              84
              0.3333333333333333
              0.5
              0.16666666666666669

            
              A
              B
              85
              0.5
              0.5
              0.0

            
              A
              B
              86
              0.6666666666666666
              0.5
              0.16666666666666663

            
              A
              B
              87
              0.3333333333333333
              1.0
              0.6666666666666667

            
              A
              B
              88
              1.0
              1.0
              0.0

            
              A
              B
              89
              0.6666666666666666
              0.5
              0.16666666666666663

            
              A
              B
              90
              0.3333333333333333
              0.5
              0.16666666666666669

            
              A
              B
              91
              0.6666666666666666
              1.0
              0.33333333333333337

            
              A
              B
              92
              0.3333333333333333
              1.0
              0.6666666666666667

            
              A
              B
              93
              0.6666666666666666
              0.5
              0.16666666666666663

            
              A
              B
              94
              0.5
              1.0
              0.5

            
              A
              B
              95
              0.5
              0.5
              0.0

            
              A
              B
              96
              0.3333333333333333
              0.5
              0.16666666666666669

            
              A
              B
              97
              0.3333333333333333
              1.0
              0.6666666666666667

            
              A
              B
              98
              0.5
              1.0
              0.5

            
              A
              B
              99
              0.5
              0.5
              0.0

            
              A
              B
              100
              0.5
              0.5
              0.0

## test.pops
indIDs popIDs
A1 A
A2 A
A3 A
B1 B
B2 B
C C

## test.vcf.gz.tbi

      
    Raw
  

              test.vcf.gz.tbi
            
          
            View raw
	[flake8]
	max-line-length = 120
	exclude=
	.git,
	.github,
	env,
	venv,
	build,
	dist
	ignore=
	# Block comment should start with '# '
	# Not if it's a commented out line
	E265,

	# Ambiguous variable names
	# It's absolutely fine to have i and I
	E741,

	# List comprehension redefines variable
	# Re-using throw-away variables like `i`, `x`, etc. is a Good Idea
	F812,

	# Blank line at end of file
	# This increases readability
	W391,

	# Line break before binary operator
	# This is now actually advised in pep8
	W503,

	# Line break after binary operator
	W504,
	# See https://pre-commit.com for more information
	# See https://pre-commit.com/hooks.html for more hooks
	repos:
	- repo: https://github.com/pre-commit/pre-commit-hooks
	rev: v2.4.0
	hooks:
	- id: trailing-whitespace
	- id: end-of-file-fixer
	- id: check-yaml
	- id: check-added-large-files
	args: ['--maxkb=900']
	- repo: https://github.com/psf/black
	rev: 19.3b0
	hooks:
	- id: black
	- repo: https://github.com/pycqa/pydocstyle
	rev: 4.0.0 # pick a git hash / tag to point to
	hooks:
	- id: pydocstyle
	- repo: https://gitlab.com/pycqa/flake8
	rev: 3.7.9
	hooks:
	- id: flake8
	#!/usr/bin/env python3

	import click
	import numpy as np
	import pandas as pd
	from cyvcf2 import VCF
	from tqdm import tqdm
	import warnings


	def create_population_pairs(pops, ignore_pops=None):
	    """Create unique unordered pairs of population labels.

	    Args:
	        pops: numpy array of population labels, one per sample (repeats allowed).
	        ignore_pops: optional collection of labels to leave out of the pairing;
	            ``None`` (the default) ignores nothing.

	    Returns:
	        A list of ``(p1, p2)`` tuples, one per unordered pair of distinct labels,
	        following the sorted order produced by ``np.unique``.
	    """
	    assert pops.size > 0
	    uniq_pops = np.unique(pops)
	    assert uniq_pops.size > 1
	    # A ``None`` default avoids sharing one mutable list object between calls.
	    skipped = () if ignore_pops is None else ignore_pops
	    kept = [p for p in uniq_pops if p not in skipped]
	    # Pairing each label only with the labels that follow it yields every
	    # unordered pair exactly once, without rescanning the growing result list.
	    return [(first, second) for i, first in enumerate(kept) for second in kept[i + 1:]]


	def verify_vcf_samples(vcf_file, indIDs, **kwargs):
	    """Warn about VCF samples that are absent from the supplied sample IDs.

	    Args:
	        vcf_file: path of the VCF file to open.
	        indIDs: collection of individual IDs to check the VCF samples against.
	        **kwargs: forwarded unchanged to ``cyvcf2.VCF``.
	    """
	    vcf = VCF(vcf_file, **kwargs)
	    # Build the lookup once so each membership test is a hash lookup.
	    known_ids = set(indIDs)
	    for samp in vcf.samples:
	        if samp not in known_ids:
	            # ``samp`` was read from the VCF, so the thing it is missing from
	            # is the supplied ID collection, and the message says so.
	            warnings.warn(f"Sample {samp} in VCF file {vcf_file} is not in the supplied sample list!")


	def read_vcf(vcf_file, qual_filter=30, **kwargs):
	    """Read per-variant genotype codes from a VCF into a 2-D numpy array.

	    Args:
	        vcf_file: path of the VCF file to open.
	        qual_filter: only variants whose QUAL is strictly greater are kept.
	        **kwargs: forwarded unchanged to ``cyvcf2.VCF``.

	    Returns:
	        ``(geno, samples)``: the ``gt_types`` rows of the kept variants stacked
	        vertically (one row per variant), and an array of the VCF sample names.

	    Raises:
	        ValueError: if no variant passes the quality filter.
	    """
	    vcf = VCF(vcf_file, **kwargs)
	    samples = np.asarray(vcf.samples)
	    rows = []
	    for v in vcf:
	        # NOTE(review): cyvcf2 reports a missing QUAL ('.') as None, which
	        # cannot be compared with ``>``; such records are treated as failing
	        # the filter instead of aborting the whole read -- confirm this policy.
	        if v.QUAL is not None and v.QUAL > qual_filter:
	            # The copy is kept from the original code; presumably it protects
	            # against the per-variant buffer being reused -- TODO confirm.
	            rows.append(v.gt_types.copy())
	    if not rows:
	        # np.vstack([]) would raise the same exception type with an opaque message.
	        raise ValueError(f"No variants in {vcf_file} passed the QUAL > {qual_filter} filter.")
	    return np.vstack(rows), samples


	def extract_pops(geno, samples, pop_samples):
	    """Select the genotype columns that belong to one population.

	    Args:
	        geno: 2-D genotype array with one column per entry of ``samples``.
	        samples: numpy array of sample names, in the column order of ``geno``.
	        pop_samples: iterable of sample names to pull out.

	    Returns:
	        ``(sub_geno, found)``: the selected columns of ``geno``, in the order the
	        names were requested, and an array of the names that were found.

	    Raises:
	        ValueError: if fewer than two of the requested samples are present.
	    """
	    assert geno.shape[1] == samples.size
	    # Index the sample names once instead of scanning the whole array for every
	    # requested sample; ``setdefault`` keeps the first occurrence of a name.
	    column_of = {}
	    for col, name in enumerate(samples):
	        column_of.setdefault(name, col)
	    idxs = []
	    valid_samples = []
	    for sp in pop_samples:
	        col = column_of.get(sp)
	        if col is None:
	            warnings.warn(f"sample {sp} not in VCF!")
	            continue
	        idxs.append(col)
	        valid_samples.append(sp)
	    if len(idxs) < 2:
	        raise ValueError("Need at least two samples to calculate pairwise differences.")
	    return geno[:, np.array(idxs)], np.array(valid_samples)


	def haplotype_diversity(haps, vcf_file, treat_missing=False, **kwargs):
	    """Compute haplotype diversity and the number of distinct haplotypes.

	    Args:
	        haps: 2-D genotype array; columns are compared as whole haplotypes.
	        vcf_file: path of the VCF, opened only to read its ``UNKNOWN`` code.
	        treat_missing: accepted for signature parity; not used by this function.
	        **kwargs: forwarded unchanged to ``cyvcf2.VCF``.

	    Returns:
	        ``(hap_div, n_haps)``: ``n / (n - 1) * (1 - sum(freq ** 2))`` over the
	        distinct columns, and the number of distinct columns.
	    """
	    assert haps.ndim == 2
	    n = haps.shape[1]
	    assert n > 2
	    unknown_code = VCF(vcf_file, **kwargs).UNKNOWN
	    # Reduce every call to a boolean: True only when the code is neither 0 nor
	    # the VCF's "unknown" code, so unknown entries compare equal to code 0.
	    is_variant = np.logical_and(haps != 0, haps != unknown_code)
	    _, counts = np.unique(is_variant, axis=1, return_counts=True)
	    freqs = counts / counts.sum()
	    homozygosity = np.sum(freqs ** 2)
	    hap_div = (n / (n - 1)) * (1.0 - homozygosity)
	    return hap_div, counts.size


	def pairwise_diff(haps, vcf_file, treat_missing=False, **kwargs):
	    """Compute the per-site absolute difference for every ordered column pair.

	    Args:
	        haps: 2-D genotype array with one column per sample.
	        vcf_file: path of the VCF, opened to obtain its ``UNKNOWN`` code.
	        treat_missing: when true, sites where either column equals the
	            ``UNKNOWN`` code are excluded from both the sum and the divisor;
	            otherwise the sum is divided by the total number of rows.
	        **kwargs: forwarded unchanged to ``cyvcf2.VCF``.

	    Returns:
	        A 1-D array with one value per ordered pair ``(i, j)``, ``i != j``, so
	        every unordered pair contributes two equal entries.
	    """
	    assert haps.ndim == 2
	    assert haps.shape[1] > 1
	    vcf = VCF(vcf_file, **kwargs)
	    n_pop = haps.shape[1]
	    n_sites = haps.shape[0]
	    diffs = []
	    for i in range(n_pop):
	        for j in range(n_pop):
	            if i == j:
	                continue
	            col_i = haps[:, i]
	            col_j = haps[:, j]
	            abs_delta = np.abs(col_i - col_j)
	            if treat_missing:
	                usable = ~((col_i == vcf.UNKNOWN) | (col_j == vcf.UNKNOWN))
	                diffs.append(np.sum(abs_delta, where=usable) / np.sum(usable))
	            else:
	                diffs.append(np.sum(abs_delta) / n_sites)
	    return np.asarray(diffs)


	def bootstrap_pairwise_differences(
	    haps_combined, vcf_file, num_reps=10, treat_missing=False, **kwargs
	):
	    """Compute mean pairwise differences for random two-way splits of the samples.

	    Each replicate draws ``int(n / 2)`` columns without replacement as the first
	    group and uses all remaining columns as the second group.

	    Args:
	        haps_combined: 2-D genotype array holding the pooled samples as columns.
	        vcf_file: path of the VCF, passed through to ``pairwise_diff``.
	        num_reps: number of random splits to evaluate.
	        treat_missing: passed through to ``pairwise_diff``.
	        **kwargs: passed through to ``pairwise_diff``.

	    Returns:
	        ``(p1, p2, pairwise_distance)``: per-replicate mean pairwise difference of
	        each group, and the absolute difference between the two.
	    """
	    assert haps_combined.ndim == 2
	    assert haps_combined.shape[1] > 2
	    n_ind = haps_combined.shape[1]
	    p1 = np.zeros(num_reps)
	    p2 = np.zeros(num_reps)
	    for i in tqdm(range(num_reps)):
	        # Randomly sample from the collected set of both samples ...
	        idx = np.random.choice(n_ind, replace=False, size=int(n_ind / 2))
	        # ``~idx`` on an integer index array is a bitwise NOT (-idx - 1), which
	        # selects negative positions and can overlap the first group. A boolean
	        # mask gives the true complement of the drawn columns.
	        in_first = np.zeros(n_ind, dtype=bool)
	        in_first[idx] = True
	        hap1 = haps_combined[:, idx]
	        hap2 = haps_combined[:, ~in_first]
	        diff1 = pairwise_diff(hap1, vcf_file, treat_missing=treat_missing, **kwargs)
	        diff2 = pairwise_diff(hap2, vcf_file, treat_missing=treat_missing, **kwargs)
	        p1[i] = np.mean(diff1)
	        p2[i] = np.mean(diff2)
	    # Setup the full set of pairwise differences
	    pairwise_distance = np.abs(p1 - p2)
	    return p1, p2, pairwise_distance


	def bootstrap_haplotype_diversity(
	    haps_combined, vcf_file, num_reps, treat_missing=False, **kwargs
	):
	    """Compute haplotype statistics for random two-way splits of the samples.

	    Each replicate draws ``int(n / 2)`` columns without replacement as the first
	    group and uses all remaining columns as the second group.

	    Args:
	        haps_combined: 2-D genotype array holding the pooled samples as columns.
	        vcf_file: path of the VCF, passed through to ``haplotype_diversity``.
	        num_reps: number of random splits to evaluate.
	        treat_missing: passed through to ``haplotype_diversity``.
	        **kwargs: passed through to ``haplotype_diversity``.

	    Returns:
	        ``(num_haps1, num_haps2, pairwise_n_hap, hap_div1, hap_div2,
	        pairwise_hap_div)``: per-replicate haplotype counts and diversities of
	        both groups, each followed by the absolute difference between groups.
	    """
	    assert haps_combined.ndim == 2
	    assert haps_combined.shape[1] > 2
	    n_ind = haps_combined.shape[1]
	    num_haps1 = np.zeros(num_reps)
	    num_haps2 = np.zeros(num_reps)
	    hap_div1 = np.zeros(num_reps)
	    hap_div2 = np.zeros(num_reps)
	    for i in tqdm(range(num_reps)):
	        # Randomly sample from the collected set of both samples ...
	        idx = np.random.choice(n_ind, replace=False, size=int(n_ind / 2))
	        # ``~idx`` on an integer index array is a bitwise NOT (-idx - 1), which
	        # selects negative positions and can overlap the first group. A boolean
	        # mask gives the true complement of the drawn columns.
	        in_first = np.zeros(n_ind, dtype=bool)
	        in_first[idx] = True
	        hap1 = haps_combined[:, idx]
	        hap2 = haps_combined[:, ~in_first]
	        hap_div1[i], num_haps1[i] = haplotype_diversity(
	            hap1, vcf_file, treat_missing=treat_missing, **kwargs
	        )
	        hap_div2[i], num_haps2[i] = haplotype_diversity(
	            hap2, vcf_file, treat_missing=treat_missing, **kwargs
	        )
	    # Setup the full set of pairwise differences
	    pairwise_n_hap = np.abs(num_haps1 - num_haps2)
	    pairwise_hap_div = np.abs(hap_div1 - hap_div2)
	    return num_haps1, num_haps2, pairwise_n_hap, hap_div1, hap_div2, pairwise_hap_div


	def create_pairwise_diff_df(
	    vcf, poplist, ignore_pops, missing_correct, nreps, seed, out
	):
	    """Create a data frame of observed and resampled pairwise differences.

	    For every population pair, row ``reps == 0`` holds the statistics of the two
	    populations as given, and rows ``1..nreps`` hold the values returned by
	    ``bootstrap_pairwise_differences`` for the pooled samples.

	    Args:
	        vcf: path of the VCF file.
	        poplist: whitespace-delimited table with ``indIDs`` and ``popIDs`` columns.
	        ignore_pops: comma-separated population labels to leave out.
	        missing_correct: forwarded as ``treat_missing`` to the statistics helpers.
	        nreps: number of random re-splits per population pair (must be > 0).
	        seed: seed for NumPy's global random generator (must be > 0).
	        out: accepted for a uniform signature; not used inside this function.

	    Returns:
	        A ``pandas.DataFrame`` with columns ``popA``, ``popB``, ``reps``, ``piA``,
	        ``piB`` and ``pi_diff``.
	    """
	    assert seed > 0
	    assert nreps > 0
	    np.random.seed(seed)
	    # Raw string: "\s" is not a valid escape in a normal string literal.
	    pop_df = pd.read_csv(poplist, sep=r"\s+", dtype=str)
	    verify_vcf_samples(vcf, pop_df.indIDs.values, lazy=True)
	    pops_to_ignore = ignore_pops.split(",")
	    # cyvcf2 swaps the HOM_ALT and UNKNOWN codes depending on ``gts012``, so the
	    # genotype array and every later ``UNKNOWN`` lookup must come from readers
	    # configured the same way. The original mixed both settings.
	    # NOTE(review): gts012=True is chosen to match create_pairwise_hap_df's
	    # read_vcf call; it changes HOM_ALT from 3 to 2 in the data -- confirm.
	    vcf_kwargs = {"strict_gt": True, "gts012": True}
	    geno, samples = read_vcf(vcf, **vcf_kwargs)
	    assert geno.ndim > 1
	    columns = {"popA": [], "popB": [], "reps": [], "piA": [], "piB": [], "pi_diff": []}
	    pop_pairs = create_population_pairs(pop_df.popIDs.values, pops_to_ignore)
	    for (p1, p2) in tqdm(pop_pairs):
	        p1_samples = pop_df.indIDs[pop_df.popIDs == p1].values
	        p2_samples = pop_df.indIDs[pop_df.popIDs == p2].values
	        haps_p1, _ = extract_pops(geno, samples, p1_samples)
	        haps_p2, _ = extract_pops(geno, samples, p2_samples)
	        full_pi1 = pairwise_diff(haps_p1, vcf, treat_missing=missing_correct, **vcf_kwargs)
	        full_pi2 = pairwise_diff(haps_p2, vcf, treat_missing=missing_correct, **vcf_kwargs)
	        full_mean_pi1 = np.mean(full_pi1)
	        full_mean_pi2 = np.mean(full_pi2)
	        full_diff = np.abs(full_mean_pi1 - full_mean_pi2)
	        # Running the bootstrapped pairwise_diff
	        hap_combined = np.hstack([haps_p1, haps_p2])
	        pi_p1, pi_p2, cur_pi_diff = bootstrap_pairwise_differences(
	            hap_combined,
	            vcf,
	            num_reps=nreps,
	            treat_missing=missing_correct,
	            **vcf_kwargs,
	        )
	        # The observed (un-resampled) values go first, as replicate 0.
	        columns["popA"].append(np.repeat(p1, nreps + 1))
	        columns["popB"].append(np.repeat(p2, nreps + 1))
	        columns["reps"].append(np.arange(0, nreps + 1))
	        columns["piA"].append(np.insert(pi_p1, 0, full_mean_pi1, axis=0))
	        columns["piB"].append(np.insert(pi_p2, 0, full_mean_pi2, axis=0))
	        columns["pi_diff"].append(np.insert(cur_pi_diff, 0, full_diff, axis=0))
	    test_df = pd.DataFrame({name: np.concatenate(chunks) for name, chunks in columns.items()})
	    return test_df


	def create_pairwise_hap_df(
	    vcf, poplist, ignore_pops, missing_correct, nreps, seed, out
	):
	    """Create a data frame of observed and resampled haplotype statistics.

	    For every population pair, row ``reps == 0`` holds the statistics of the two
	    populations as given, and rows ``1..nreps`` hold the values returned by
	    ``bootstrap_haplotype_diversity`` for the pooled samples.

	    Args:
	        vcf: path of the VCF file.
	        poplist: whitespace-delimited table with ``indIDs`` and ``popIDs`` columns.
	        ignore_pops: comma-separated population labels to leave out.
	        missing_correct: forwarded as ``treat_missing`` to the statistics helpers.
	        nreps: number of random re-splits per population pair (must be > 0).
	        seed: seed for NumPy's global random generator (must be > 0).
	        out: accepted for a uniform signature; not used inside this function.

	    Returns:
	        A ``pandas.DataFrame`` with columns ``popA``, ``popB``, ``reps``,
	        ``n_hapA``, ``n_hapB``, ``n_hap_paired``, ``hap_divA``, ``hap_divB`` and
	        ``hap_div_paired``.
	    """
	    assert seed > 0
	    assert nreps > 0
	    np.random.seed(seed)
	    # Raw string: "\s" is not a valid escape in a normal string literal.
	    pop_df = pd.read_csv(poplist, sep=r"\s+", dtype=str)
	    verify_vcf_samples(vcf, pop_df.indIDs.values, lazy=True)
	    pops_to_ignore = ignore_pops.split(",")
	    # cyvcf2 swaps the HOM_ALT and UNKNOWN codes depending on ``gts012``. The
	    # data is read with gts012=True, so the resampling step must look up
	    # ``UNKNOWN`` with the same setting (the original omitted it there).
	    vcf_kwargs = {"strict_gt": True, "gts012": True}
	    geno, samples = read_vcf(vcf, **vcf_kwargs)
	    assert geno.ndim > 1
	    columns = {
	        "popA": [],
	        "popB": [],
	        "reps": [],
	        "n_hapA": [],
	        "n_hapB": [],
	        "n_hap_paired": [],
	        "hap_divA": [],
	        "hap_divB": [],
	        "hap_div_paired": [],
	    }
	    pop_pairs = create_population_pairs(pop_df.popIDs.values, pops_to_ignore)
	    for (p1, p2) in tqdm(pop_pairs):
	        p1_samples = pop_df.indIDs[pop_df.popIDs == p1].values
	        p2_samples = pop_df.indIDs[pop_df.popIDs == p2].values
	        haps_p1, _ = extract_pops(geno, samples, p1_samples)
	        haps_p2, _ = extract_pops(geno, samples, p2_samples)
	        full_hap_div1, full_n_hap1 = haplotype_diversity(
	            haps_p1, vcf, treat_missing=missing_correct, **vcf_kwargs
	        )
	        full_hap_div2, full_n_hap2 = haplotype_diversity(
	            haps_p2, vcf, treat_missing=missing_correct, **vcf_kwargs
	        )
	        full_hap_diff = np.abs(full_hap_div1 - full_hap_div2)
	        full_n_hap_diff = np.abs(full_n_hap1 - full_n_hap2)
	        # Running the bootstrapped haplotype statistics
	        hap_combined = np.hstack([haps_p1, haps_p2])
	        (
	            num_haps1,
	            num_haps2,
	            pairwise_n_hap,
	            hap_div1,
	            hap_div2,
	            pairwise_hap_div,
	        ) = bootstrap_haplotype_diversity(
	            hap_combined,
	            vcf,
	            num_reps=nreps,
	            treat_missing=missing_correct,
	            **vcf_kwargs,
	        )
	        # The observed (un-resampled) values go first, as replicate 0.
	        columns["popA"].append(np.repeat(p1, nreps + 1))
	        columns["popB"].append(np.repeat(p2, nreps + 1))
	        columns["reps"].append(np.arange(0, nreps + 1))
	        columns["n_hapA"].append(np.insert(num_haps1, 0, full_n_hap1, axis=0))
	        columns["n_hapB"].append(np.insert(num_haps2, 0, full_n_hap2, axis=0))
	        columns["n_hap_paired"].append(np.insert(pairwise_n_hap, 0, full_n_hap_diff, axis=0))
	        columns["hap_divA"].append(np.insert(hap_div1, 0, full_hap_div1, axis=0))
	        columns["hap_divB"].append(np.insert(hap_div2, 0, full_hap_div2, axis=0))
	        columns["hap_div_paired"].append(np.insert(pairwise_hap_div, 0, full_hap_diff, axis=0))
	    test_df = pd.DataFrame({name: np.concatenate(chunks) for name, chunks in columns.items()})
	    return test_df


	@click.command()
	@click.option("--vcf", "-v", required=True, help="Input VCF file.")
	@click.option("--poplist", "-p", required=True, help="List of populations for each sample in the VCF")
	@click.option("--ignore_pops", "-i", type=str, default="", help="Populations to ignore")
	@click.option(
	    "--missing_correct",
	    "-c",
	    type=bool,
	    default=True,
	    help="Correction for missingness in pairwise differences.",
	)
	@click.option(
	    "--nreps", "-n", type=int, default=100, help="Number of bootstrap replicates."
	)
	@click.option(
	    "--mode", "-m", default="pairwise_diff", help="Mode for statistics computed."
	)
	@click.option("--seed", "-s", type=int, default=42, help="Random number seed.")
	@click.option("--out", "-o", default="out.csv", help="Output CSV")
	def main(vcf, poplist, ignore_pops, missing_correct, nreps, mode, seed, out):
	    """Run the main function for running scripted analyses."""
	    # The docstring above is left untouched because click prints it as the
	    # command's help text.
	    # --vcf and --poplist are required so that omitting them gives a usage
	    # error instead of a failure deep inside the file readers.
	    builders = {
	        "pairwise_diff": create_pairwise_diff_df,
	        "haplotype_diff": create_pairwise_hap_df,
	    }
	    if mode not in builders:
	        raise ValueError(f"{mode} is not a valid mode!")
	    test_df = builders[mode](vcf, poplist, ignore_pops, missing_correct, nreps, seed, out)
	    # Write out the files here ...
	    test_df.to_csv(out, index=False)

	# Invoke the click command only when this file is run as a script, not on import.
	if __name__ == "__main__":
	main()
popA	popB	reps	piA	piB	pi_diff
A	B	1	0.5	0.5	0.0
A	B	2	0.3333333333333333	1.0	0.6666666666666667
A	B	3	0.5	0.5	0.0
A	B	4	0.3333333333333333	1.0	0.6666666666666667
A	B	5	0.5	1.0	0.5
A	B	6	0.6666666666666666	0.5	0.16666666666666663
A	B	7	0.6666666666666666	1.0	0.33333333333333337
A	B	8	0.3333333333333333	1.0	0.6666666666666667
A	B	9	0.5	1.0	0.5
A	B	10	0.3333333333333333	1.0	0.6666666666666667
A	B	11	0.5	0.5	0.0
A	B	12	0.5	1.0	0.5
A	B	13	0.3333333333333333	0.5	0.16666666666666669
A	B	14	0.5	1.0	0.5
A	B	15	0.3333333333333333	1.0	0.6666666666666667
A	B	16	0.3333333333333333	1.0	0.6666666666666667
A	B	17	0.6666666666666666	1.0	0.33333333333333337
A	B	18	0.5	0.5	0.0
A	B	19	0.5	0.5	0.0
A	B	20	1.0	1.0	0.0
A	B	21	1.0	1.0	0.0
A	B	22	0.5	0.5	0.0
A	B	23	0.3333333333333333	1.0	0.6666666666666667
A	B	24	0.3333333333333333	1.0	0.6666666666666667
A	B	25	0.6666666666666666	1.0	0.33333333333333337
A	B	26	0.3333333333333333	0.5	0.16666666666666669
A	B	27	0.5	0.5	0.0
A	B	28	0.3333333333333333	0.5	0.16666666666666669
A	B	29	0.5	1.0	0.5
A	B	30	1.0	1.0	0.0
A	B	31	1.0	0.5	0.5
A	B	32	0.5	0.5	0.0
A	B	33	0.5	0.5	0.0
A	B	34	0.5	0.5	0.0
A	B	35	0.3333333333333333	1.0	0.6666666666666667
A	B	36	0.6666666666666666	0.5	0.16666666666666663
A	B	37	1.0	1.0	0.0
A	B	38	0.5	0.5	0.0
A	B	39	0.5	1.0	0.5
A	B	40	0.5	0.5	0.0
A	B	41	0.5	0.5	0.0
A	B	42	1.0	0.5	0.5
A	B	43	0.5	0.5	0.0
A	B	44	0.5	1.0	0.5
A	B	45	0.5	0.5	0.0
A	B	46	0.5	1.0	0.5
A	B	47	1.0	1.0	0.0
A	B	48	0.5	1.0	0.5
A	B	49	1.0	1.0	0.0
A	B	50	0.3333333333333333	0.5	0.16666666666666669
A	B	51	0.3333333333333333	1.0	0.6666666666666667
A	B	52	0.3333333333333333	1.0	0.6666666666666667
A	B	53	0.6666666666666666	1.0	0.33333333333333337
A	B	54	1.0	0.5	0.5
A	B	55	0.3333333333333333	0.5	0.16666666666666669
A	B	56	0.3333333333333333	0.5	0.16666666666666669
A	B	57	0.3333333333333333	0.5	0.16666666666666669
A	B	58	0.3333333333333333	1.0	0.6666666666666667
A	B	59	0.3333333333333333	1.0	0.6666666666666667
A	B	60	0.3333333333333333	0.5	0.16666666666666669
A	B	61	0.6666666666666666	1.0	0.33333333333333337
A	B	62	0.6666666666666666	1.0	0.33333333333333337
A	B	63	0.6666666666666666	1.0	0.33333333333333337
A	B	64	0.3333333333333333	0.5	0.16666666666666669
A	B	65	0.3333333333333333	0.5	0.16666666666666669
A	B	66	0.5	1.0	0.5
A	B	67	0.3333333333333333	0.5	0.16666666666666669
A	B	68	0.6666666666666666	0.5	0.16666666666666663
A	B	69	1.0	0.5	0.5
A	B	70	0.6666666666666666	0.5	0.16666666666666663
A	B	71	0.5	0.5	0.0
A	B	72	0.3333333333333333	1.0	0.6666666666666667
A	B	73	1.0	0.5	0.5
A	B	74	0.6666666666666666	1.0	0.33333333333333337
A	B	75	0.6666666666666666	1.0	0.33333333333333337
A	B	76	0.5	0.5	0.0
A	B	77	0.6666666666666666	1.0	0.33333333333333337
A	B	78	0.3333333333333333	1.0	0.6666666666666667
A	B	79	0.6666666666666666	1.0	0.33333333333333337
A	B	80	1.0	0.5	0.5
A	B	81	0.6666666666666666	0.5	0.16666666666666663
A	B	82	0.6666666666666666	1.0	0.33333333333333337
A	B	83	0.5	0.5	0.0
A	B	84	0.3333333333333333	0.5	0.16666666666666669
A	B	85	0.5	0.5	0.0
A	B	86	0.6666666666666666	0.5	0.16666666666666663
A	B	87	0.3333333333333333	1.0	0.6666666666666667
A	B	88	1.0	1.0	0.0
A	B	89	0.6666666666666666	0.5	0.16666666666666663
A	B	90	0.3333333333333333	0.5	0.16666666666666669
A	B	91	0.6666666666666666	1.0	0.33333333333333337
A	B	92	0.3333333333333333	1.0	0.6666666666666667
A	B	93	0.6666666666666666	0.5	0.16666666666666663
A	B	94	0.5	1.0	0.5
A	B	95	0.5	0.5	0.0
A	B	96	0.3333333333333333	0.5	0.16666666666666669
A	B	97	0.3333333333333333	1.0	0.6666666666666667
A	B	98	0.5	1.0	0.5
A	B	99	0.5	0.5	0.0
A	B	100	0.5	0.5	0.0