# 2021/1/17
# translating sclust file to pyclone's input
# python=3.8
# usage: python sclust_to_pyclone.py -i "Path to allelic file" -v "Path to vcf file" -n "Sample's name" -o "Output directory"
import argparse
import os
import re

import pandas as pd


# Find DP and AF in a line.
# Could still be shortened; kept as-is for testing.
def find_DP_and_AF(text):
    """Extract the ``DP`` and ``AF`` values from a VCF INFO string.

    The two keys are looked up independently, so they may appear in any
    order. A key is only recognised at the start of the text or directly
    after a ``;`` or whitespace, which keeps look-alike keys such as
    ``MLEAF=`` or ``FDP=`` from being mistaken for ``AF=`` / ``DP=``.
    When ``AF`` lists several comma-separated values, only the first one is
    returned.

    Args:
        text: The INFO field (or a whole VCF record line) to search.

    Returns:
        A ``(DP, AF)`` tuple of strings, exactly as written in ``text``.

    Raises:
        ValueError: If ``DP`` or ``AF`` cannot be found in ``text``.
    """
    text = str(text)
    # ``re`` caches compiled patterns, so calling ``re.search`` with literal
    # patterns in a per-record loop does not recompile them each time.
    dp_match = re.search(r'(?<![^;\s])DP=(\d+)', text)
    # Accept plain decimals ("0.25", "1", ".5") and scientific notation
    # ("1e-05"), which the previous pattern silently truncated.
    af_match = re.search(
        r'(?<![^;\s])AF=((?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)', text)
    if dp_match is None or af_match is None:
        raise ValueError('could not find both DP and AF in: %r' % text)
    return (dp_match.group(1), af_match.group(1))  # (DP,AF)


def genotype(minor):
    """Return the genotype label for a minor copy number.

    Gives ``'BB'`` when ``minor`` equals 0 and ``'AB'`` for every other
    value.
    """
    return 'BB' if minor == 0 else 'AB'


# find Major and Minor copy number
def find_major_and_minor(pos, df, chr):
    """Look up the copy numbers of the segment that contains a position.

    Args:
        pos: Position to look up.
        df: DataFrame with ``Chromosome``, ``Start``, ``End``, ``A`` and
            ``B`` columns, one row per segment.
        chr: Chromosome of the position. It is compared both as given and
            as a string, so ``1`` and ``'1'`` name the same chromosome.

    Returns:
        ``(A, B)`` of the first matching row, i.e. ``(major, minor)``, or
        ``(1, 1)`` when no segment contains the position.
    """
    chromosomes = df['Chromosome']
    # The two inputs are parsed separately and may end up with different
    # dtypes (int vs str); also compare the string forms so such a mismatch
    # does not silently drop every row.
    same_chromosome = (chromosomes == chr) | (chromosomes.astype(str) == str(chr))
    # Segment boundaries are inclusive: a position sitting exactly on Start
    # or End still belongs to that segment.
    covers_pos = (df['Start'] <= pos) & (df['End'] >= pos)
    hits = df.loc[covers_pos & same_chromosome, ['A', 'B']]
    if hits.empty:
        return (1, 1)
    # Read each column separately so A and B keep their own dtypes; when
    # several segments match, the first one in file order wins.
    return (hits['A'].iloc[0], hits['B'].iloc[0])  # (major,minor)


# Command-line interface. The four paths/names are mandatory, so argparse
# reports a missing one instead of the script failing later on a None value.
parser = argparse.ArgumentParser(description="deliver file's dir.")
parser.add_argument('--input_allelic', '-i', type=str, required=True,
                    help="Path to where allelic data files are stored", metavar='str')
parser.add_argument('--sample_name', '-n', type=str, required=True, help="Sample to run", metavar='str')
parser.add_argument('--output_dir', '-o', type=str, required=True,
                    help="Directory where the output is saved", metavar='str')
parser.add_argument('--input_vcf', '-v', type=str, required=True,
                    help="Path to where vcf data files are stored", metavar='str')
# Previously hard-coded in the output rows; the default keeps the old value.
parser.add_argument('--variant_case', '-c', type=str, default='H2171',
                    help="Value written to the variant_case column (default: %(default)s)", metavar='str')
args = parser.parse_args()
allelic_file = args.input_allelic
vcf_file = args.input_vcf
output_dir = args.output_dir
sample = args.sample_name
variant_case = args.variant_case
# header for tsv
header = 'mutation_id\tref_counts\tvar_counts\tnormal_cn\tminor_cn\tmajor_cn\tvariant_case\tvariant_freq\tgenotype\n'
# Load both inputs. Chromosome names are forced to str in both tables so
# that purely numeric names ("1", "2", ...) compare equal across the files
# and can be joined into the mutation id. The handles are closed as soon as
# the tables are parsed.
with open(vcf_file, 'r') as f1:
    df1 = pd.read_csv(f1, sep='\t', header=None, comment='#', dtype={0: str})
f1_rows = df1.shape[0]
with open(allelic_file, 'r') as f2:
    df2 = pd.read_csv(f2, sep='\t', header=0, comment='#', dtype={'Chromosome': str})
df2['Start'] = df2['Start'].astype(int)
df2['End'] = df2['End'].astype(int)
f2_rows = df2.shape[0]
# Write one output row per VCF record. os.path.join picks the right path
# separator for the current platform.
with open(os.path.join(output_dir, sample + '.tsv'), 'w') as t:
    t.write(header)
    # Columns 0, 1 and 7 of a VCF record are CHROM, POS and INFO.
    for chrom, pos, info in zip(df1[0], df1[1], df1[7]):
        # get information
        (major, minor) = find_major_and_minor(pos, df2, chrom)
        (DP, AF) = find_DP_and_AF(info)
        # AF is a rounded decimal, so AF * DP can land just below the true
        # integer count (e.g. 17.99999); round instead of truncating.
        var_counts = round(float(AF) * float(DP))
        # ref_counts is the number of reads carrying the reference allele,
        # i.e. total depth minus the variant-supporting reads.
        ref_counts = int(DP) - var_counts
        geno = genotype(minor)
        # write information
        fields = [
            ':'.join((sample, geno, chrom, str(pos))),  # mutation_id
            str(ref_counts),
            str(var_counts),
            '2',  # normal_cn
            str(minor),
            str(major),
            variant_case,
            AF,
            geno,
        ]
        t.write('\t'.join(fields) + '\n')

print('Process done')
