__author__ = 'leihuang'
import random
# Raw interaction lists. In this file they are only referenced by the
# commented-out remove_self_interaction_duplicates() calls near the bottom.
# NOTE(review): presumably the DIP yeast and HPRD human PPI datasets -- confirm.
yeast_data_file = './data/DIP_Yeast_BPID.txt'
human_data_file = './data/HPRD_IDI.txt'
# Prefix prepended by train_test_builder() to every generated file name; it is
# concatenated as a plain string, so the trailing '/' is significant.
output_path = './data/train_test_2018/'
# Cleaned interaction lists: the output targets of the commented-out
# preprocessing calls, and the inputs to the train/test split loop below.
yeast_data_cleaned_file = './data/DIP_Yeast_BPID_cleaned.txt'
human_data_cleaned_file = './data/HPRD_IDI_cleaned.txt'
#output_prefix = human_ppi_

def train_test_builder(data_file, alpha, output_prefix, r, output_dir=None):
    """Randomly split the lines of ``data_file`` into a training and a testing file.

    Every input line is copied, unchanged, to exactly one of two files inside
    the output directory:

    * ``<output_prefix><alpha>rand_training_r<r>.txt``
    * ``<output_prefix><alpha>rand_testing_r<r>.txt``

    A line goes to the training file when a uniform draw from ``[0, 1)`` is
    ``<= alpha``; otherwise it goes to the testing file.  ``alpha`` is
    therefore the expected (not exact) fraction of training lines.

    Args:
        data_file: Path of the text file to split, one record per line.
        alpha: Probability threshold for assigning a line to the training set.
        output_prefix: String placed at the start of both output file names.
        r: Run/repetition identifier embedded in the output file names.
        output_dir: Directory to write into.  When ``None`` (the default) the
            module-level ``output_path`` is used, as before.

    Raises:
        IOError/OSError: If the input cannot be read or an output file cannot
            be created (for example when the output directory does not exist).
    """
    import os

    # Re-seed from system entropy/time so each call draws a fresh split.
    random.seed()

    base_dir = output_path if output_dir is None else output_dir
    stem = output_prefix + str(alpha)
    training_data = os.path.join(base_dir, stem + 'rand_training_r' + str(r) + '.txt')
    testing_data = os.path.join(base_dir, stem + 'rand_testing_r' + str(r) + '.txt')

    # Open the input first so a missing data file does not leave behind (or
    # truncate) output files; ``with`` closes all three handles even on error.
    with open(data_file, 'r') as src:
        with open(training_data, 'w') as ftr, open(testing_data, 'w') as ftt:
            for line in src:
                if random.random() <= alpha:
                    ftr.write(line)
                else:
                    ftt.write(line)

def remove_self_interaction_duplicates(data_file, output_file):
    """Copy an edge list, dropping self-interactions and duplicate edges.

    Each non-blank line of ``data_file`` must start with two tab-separated
    integer node ids; any further columns are ignored for comparison but are
    preserved in the output.  Edges are treated as undirected, so ``a<TAB>b``
    and ``b<TAB>a`` count as the same edge and only the first occurrence is
    written.  Lines whose two ids are equal are reported on stdout and
    omitted.  Blank lines are skipped.

    Args:
        data_file: Path of the tab-separated edge list to read.
        output_file: Path of the cleaned edge list to write (overwritten).

    Raises:
        ValueError: If a non-blank line does not start with two integer fields.
        IOError/OSError: If either file cannot be opened.
    """
    seen_edges = set()
    with open(data_file, 'r') as src:
        with open(output_file, 'w') as fw:
            for line in src:
                if not line.strip():
                    # A blank (often trailing) line carries no edge.
                    continue
                a, b = map(int, line.strip().split('\t')[0:2])
                if a == b:
                    print('self interaction {}, {}'.format(a, b))
                    continue
                # Normalise the orientation so one key covers both (a, b) and
                # (b, a).  The previous test, ``(a, b) and (b, a) not in
                # edge_set``, only ever checked the reversed tuple.
                edge = (a, b) if a < b else (b, a)
                if edge not in seen_edges:
                    seen_edges.add(edge)
                    fw.write(line)

# One-off preprocessing step, kept for reference: uncomment to regenerate the
# cleaned inputs from the raw interaction files before building the splits.
#remove_self_interaction_duplicates(yeast_data_file, yeast_data_cleaned_file)
#remove_self_interaction_duplicates(human_data_file, human_data_cleaned_file)

# Five repeated random splits (r = 1..5) per dataset, at training thresholds
# 0.125 and 0.25.  For each run the order is: yeast then human at 0.125,
# followed by yeast then human at 0.25.
for i in range(1, 6):
    for alpha in (0.125, 0.25):
        train_test_builder(yeast_data_cleaned_file, alpha, 'yeast_ppi_', i)
        train_test_builder(human_data_cleaned_file, alpha, 'human_ppi_', i)
