From 8d7309567179761994a29a396fb772016ffce409 Mon Sep 17 00:00:00 2001
From: Julian Linke <jlinke-tug@aim-gpu6.spsc.tugraz.at>
Date: Thu, 27 Jul 2023 20:23:23 +0200
Subject: [PATCH] initial commit

---
 local/codebook_combine_arrays.py |  63 +++++++++++++
 local/codebook_freqs.py          | 136 +++++++++++++++++++++++++++
 local/create_xlsr_new.py         |  23 +++++
 local/plot_pca_similarities.py   | 156 +++++++++++++++++++++++++++++++
 local/prepare_data.py            |  53 +++++++++++
 local/similarity_matrix.py       |  96 +++++++++++++++++++
 path.sh                          |   2 +
 run.sh                           |  93 ++++++++++++++++++
 8 files changed, 622 insertions(+)
 create mode 100644 local/codebook_combine_arrays.py
 create mode 100644 local/codebook_freqs.py
 create mode 100644 local/create_xlsr_new.py
 create mode 100644 local/plot_pca_similarities.py
 create mode 100644 local/prepare_data.py
 create mode 100644 local/similarity_matrix.py
 create mode 100755 path.sh
 create mode 100755 run.sh

diff --git a/local/codebook_combine_arrays.py b/local/codebook_combine_arrays.py
new file mode 100644
index 0000000..bde5ca8
--- /dev/null
+++ b/local/codebook_combine_arrays.py
@@ -0,0 +1,63 @@
+# Author: Julian Linke (linke@tugraz.at)
+# SPSC TU Graz (July 2023)
+
+import os, sys
+import fairseq
+import torch, torchaudio
+import numpy as np
+import json
+
+def main(exp_path, json_path):
+    """Stack all per-speaker ``freqN_<spk>.npy`` vectors below ``exp_path``.
+
+    ``json_path`` holds {"corpusA_speakingstyle": [spk1, spk2, ...], ...}; each
+    key is also a sub-directory of ``exp_path``. Writes ``splits_freqs`` (one
+    row per file) and ``splits_labels`` ("<spk><style>") as .npy and .tsv.
+    """
+    with open(json_path, 'r') as f:
+        corpora = json.load(f)
+
+    # Single directory scan; sorted() makes the row order reproducible because
+    # os.listdir() returns entries in arbitrary order.
+    entries = []
+    for corpus in corpora:
+        style = corpus.split("_")[1] # second entry is always style
+        freqN_path = os.path.join(exp_path, corpus)
+        for freqN_vec in sorted(os.listdir(freqN_path)):
+            if freqN_vec.startswith('freqN_') and freqN_vec.endswith('.npy'):
+                # keep the full speaker ID, even if it contains underscores
+                split = freqN_vec[len('freqN_'):-len('.npy')]
+                entries.append((os.path.join(freqN_path, freqN_vec), f'{split}{style}'))
+    cnt = len(entries)
+    print('counted {} freqN-files'.format(cnt))
+
+    freqN_convs = np.zeros((cnt, 320**2), dtype=np.float32)
+    spks_vec = np.zeros((cnt,), dtype=object)
+    print('... start combining freqN_*.npy-files coming from path {}'.format(exp_path))
+    for idx, (file_path, label) in enumerate(entries):
+        print('read and append split {} ...'.format(file_path))
+        freqN_convs[idx,:] = np.load(file_path)
+        spks_vec[idx] = label
+    freq_path, splits_path = os.path.join(exp_path, 'splits_freqs'), os.path.join(exp_path, 'splits_labels')
+    print('\nwrite {}.npy and {}.npy'.format(freq_path, splits_path))
+    np.save(freq_path, freqN_convs)
+    np.save(splits_path, spks_vec)
+    print('\nwrite {}.tsv and {}.tsv'.format(freq_path, splits_path))
+    np.savetxt(freq_path+'.tsv', freqN_convs, delimiter='\t')
+    np.savetxt(splits_path+'.tsv', spks_vec, fmt='%s', delimiter='\t')
+    
+if __name__ == "__main__":
+    # Usage: codebook_combine_arrays.py <exp_path> <json_path>
+    # Abort on a missing argument so main() is never reached with undefined names.
+    try:
+        exp_path = sys.argv[1]
+    except IndexError:
+        sys.exit("ERROR: data_path not specified")
+    print("exp path is: " + exp_path)
+    try:
+        json_path = sys.argv[2]
+    except IndexError:
+        sys.exit("ERROR: json_path not specified")
+    print("json path is: " + json_path)
+
+    main(exp_path, json_path)
diff --git a/local/codebook_freqs.py b/local/codebook_freqs.py
new file mode 100644
index 0000000..0b7e2ea
--- /dev/null
+++ b/local/codebook_freqs.py
@@ -0,0 +1,136 @@
+# Author: Julian Linke (linke@tugraz.at)
+# SPSC TU Graz (July 2023)
+
+import os, sys
+import fairseq
+import torch, torchaudio
+import numpy as np
+import json
+
+def print_list(l):
+    """Write each element of ``l`` to stdout on its own line."""
+    sys.stdout.writelines('{}\n'.format(entry) for entry in l)
+
+def count_freq(codebook_all_indexes, freq):
+    """Add the codebook index of every frame to ``freq``; return (freq, seq)."""
+    # seq keeps the per-frame order, freq accumulates the totals per index
+    seq = [entry.item() for entry in codebook_all_indexes] # tensor(integer).item()
+    for frame_no, code in enumerate(seq, start=1):
+        if VERBOSE:
+            print('(Verbose) frame={}: used codebook entry: {}'.format(frame_no, code))
+        freq[code] += 1
+    return freq, seq
+
+def calc_codebook_indexes(audio_path, freq, N):
+    """Quantize one audio file and add its codebook indexes to ``freq``.
+
+    Returns ``(freq, Nwav)``; ``Nwav`` is 0 when the file is skipped as too short.
+    """
+    x, fs = torchaudio.load(audio_path)
+    x = x.to(device) # [1 x Samples]
+    if x.shape[1] <= 512:
+        print('WARNING: Input size of file is {} (smaller than Kernel size), skip ...'.format(x.shape[1]))
+        return freq, 0
+    with torch.no_grad(): # inference only, no autograd bookkeeping needed
+        codebook_G2_indices = model.quantize(x)[1] # [1 x T x G]; G=2
+    codebook_all_indexes = model.quantizer.to_codebook_index(codebook_G2_indices)[0] # [T]
+    freq, seq = count_freq(codebook_all_indexes, freq)
+    print('feature vectors: {}/{} (file/all)'.format(len(seq), N + len(seq)))
+    return freq, len(seq)
+
+def main(exp_path, lst_path, json_path, model_path, VERBOSE):
+    """Count, per speaker, how often each codebook entry of the model is used.
+
+    Reads "uttID audio_path" rows from ``lst_path`` and the corpus -> speakers
+    dict from ``json_path``; writes raw/normalized counts to ``exp_path``/txt
+    and normalized vectors to ``exp_path``/numpy (one sub-directory per corpus).
+    """
+    # set model and device global
+    global model, device
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_path])
+    model = model[0].to(device)
+    model.eval() # NOTE(review): presumably required for deterministic codebook choices - confirm
+    # dict with {"corpusA_speakingstyle": [spk1 spk2,...], ...}
+    with open(json_path, 'r') as f:
+        corpora = json.load(f)
+    # read the list once instead of once per speaker; only the path column is needed
+    with open(lst_path, 'r') as tsv:
+        audio_paths = [row.split(None, 1)[1].strip() for row in tsv if row.strip()]
+    log = None
+    for corpus in corpora:
+        if VERBOSE: # one log file per corpus; close the previous one first
+            if log is not None: log.close()
+            log = sys.stdout = open(os.path.join(exp_path, 'logs', "codebook_freqs_{}.log".format(corpus)),"w")
+        spks = corpora[corpus]
+        style = corpus.split("_")[1] # second entry is always style
+        os.makedirs(os.path.join(exp_path, 'txt', corpus), exist_ok=True)
+        os.makedirs(os.path.join(exp_path, 'numpy', corpus), exist_ok=True)
+        freqN_spks = np.zeros((len(spks), 320**2), dtype=np.float32) # [SPKS x 102400]
+        spks_vec = np.zeros((len(spks),), dtype=object) # [SPKS x 1]
+        for idx, spk in enumerate(spks):
+            print(f"\n--- speaker {spk} in corpus {corpus} ---")
+            # NOTE(review): 0-based keys, assuming to_codebook_index() yields 0..320**2-1 - confirm
+            freq, N = dict.fromkeys(range(320**2), 0), 0
+            for audio_path in audio_paths:
+                parts = audio_path.split('/') # .../data_corpus_speakingstyle/spk/*wav
+                if corpus == '_'.join(parts[-3].split('_')[1:]) and spk == parts[-2]:
+                    print('\nread wav-file {}'.format(audio_path))
+                    freq, Nwav = calc_codebook_indexes(audio_path, freq, N)
+                    N = N + Nwav
+            print(f'(DONE) Found {N} observations for speaker {spk} ...')
+            if N == 0: print(f'WARNING: no observations for speaker {spk}, frequencies stay zero')
+            freqN_vec = np.zeros((1, 320**2), dtype=np.float32)
+            with open(os.path.join(exp_path, 'txt', corpus, 'freq_{}.txt'.format(spk)), 'w') as ffreq, \
+                open(os.path.join(exp_path, 'txt', corpus, 'freqN_{}.txt'.format(spk)), 'w') as ffreqN:
+                for i, (code_entry, count) in enumerate(freq.items()):
+                    ffreq.write('{}\t{}\n'.format(code_entry, count))
+                    ffreqN.write('{}\t{}\n'.format(code_entry, count/max(N, 1))) # max(): no division by zero
+                    freqN_vec[0,i] = count/max(N, 1)
+            np.save(os.path.join(exp_path, 'numpy', corpus, 'freqN_{}'.format(spk)), freqN_vec)
+            # combine frequencies per corpus
+            print(f"... speaker {spk} is row {idx} of array freq_{corpus}.npy!")
+            freqN_spks[idx,:] = freqN_vec
+            spks_vec[idx] = f'{spk}{style}'
+        # write combined frequencies
+        np.save(os.path.join(exp_path, 'numpy', corpus, 'freq_{}'.format(corpus)), freqN_spks)
+        np.save(os.path.join(exp_path, 'numpy', corpus, 'spkIDs_{}'.format(corpus)), spks_vec)
+    if log is not None: # flush the last log and hand stdout back
+        log.close()
+        sys.stdout = sys.__stdout__
+
+if __name__ == "__main__":
+    # Usage: codebook_freqs.py <exp_path> <lst_path> <json_path> <model_path> [VERBOSE]
+    # Stop with a clear message on a missing argument so that main() is never
+    # reached with undefined names.
+    ############################# REQUIRED PATHS ###########################
+    try:
+        exp_path = sys.argv[1]
+    except IndexError:
+        sys.exit("ERROR: data_path not specified")
+    print("\nexp path is: " + exp_path)
+    try:
+        lst_path = sys.argv[2]
+    except IndexError:
+        sys.exit("ERROR: lst_path not specified")
+    print("list path is: " + lst_path)
+    try:
+        json_path = sys.argv[3]
+    except IndexError:
+        sys.exit("ERROR: json_path not specified")
+    print("json path is: " + json_path)
+    try:
+        model_path = sys.argv[4]
+    except IndexError:
+        sys.exit("ERROR: model_path not specified")
+    print("model path is: " + model_path)
+    ############################# VERBOSE #############################
+    # optional; count_freq() reads VERBOSE as a module-level global
+    try:
+        VERBOSE = int(sys.argv[5])
+        print("VERBOSE is " + str(VERBOSE) + "\n")
+    except (IndexError, ValueError):
+        print("VERBOSE is not specified, default is 0!")
+        VERBOSE = 0
+
+    main(exp_path, lst_path, json_path, model_path, VERBOSE)
diff --git a/local/create_xlsr_new.py b/local/create_xlsr_new.py
new file mode 100644
index 0000000..0b0589a
--- /dev/null
+++ b/local/create_xlsr_new.py
@@ -0,0 +1,23 @@
+# https://github.com/facebookresearch/fairseq/issues/3741
+
+from omegaconf import DictConfig, OmegaConf, open_dict
+import torch
+
+# Strip the eval_wer*/autoregressive task options from the xlsr_53_56k checkpoint
+# config and store the result as a new checkpoint (see the issue linked above).
+cp_path = 'model/xlsr_53_56k.pt'
+cp = torch.load(cp_path, map_location='cpu') # editing the config needs no GPU
+cfg = DictConfig(cp['cfg'])
+# report every "eval_wer*" option found in any config section
+for k, v in OmegaConf.to_container(cfg, resolve=True).items():
+    if not isinstance(v, dict):
+        continue
+    for key in v:
+        if key.split("_")[:2] == ["eval", "wer"]:
+            print(k,key)
+with open_dict(cfg):
+    for key in ('eval_wer', 'eval_wer_config', 'eval_wer_tokenizer',
+                'eval_wer_post_process', 'autoregressive'):
+        cfg.task.pop(key, None) # tolerate checkpoints that lack an option
+cp['cfg'] = cfg
+torch.save(cp, 'model/xlsr_53_56k_new.pt')
\ No newline at end of file
diff --git a/local/plot_pca_similarities.py b/local/plot_pca_similarities.py
new file mode 100644
index 0000000..431e072
--- /dev/null
+++ b/local/plot_pca_similarities.py
@@ -0,0 +1,156 @@
+# Author: Julian Linke (linke@tugraz.at)
+# SPSC TU Graz (July 2023)
+
+import os, sys
+import json
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+from sklearn.metrics.pairwise import cosine_similarity 
+from sklearn.metrics.pairwise import euclidean_distances
+from sklearn.metrics import pairwise_distances
+import faiss
+from mpl_toolkits.mplot3d import Axes3D
+import matplotlib.patches as patches
+
+# Predefined plot colors, one per corpus in JSON order (add more for more than ten corpora)
+colors = ['tab:red', 'tab:blue', 
+          'tab:green', 'tab:orange', 
+          'tab:purple', 'tab:brown', 
+          'tab:pink', 'tab:gray', 
+          'tab:olive', 'tab:cyan']
+# marker size shared by the 2D scatter plots and the 3D legend entries
+MS = 30
+
+def main(X_path, y_path, pcaA_path, pcab_path, out_path, json_path):
+    """Plot matrix X_path as a heat map and its projection X*A + b as scatter plots in out_path.
+    json_path holds {"corpusA_speakingstyle": [spk1, spk2, ...],
+                     "corpusB_speakingstyle": [spk1, spk2, ...], ...}; one plot color per corpus."""
+    with open(json_path, 'r') as f:
+        corpora = json.load(f)
+
+    X = np.load(X_path)
+    spks = np.load(y_path, allow_pickle=True)
+
+    A = np.load(pcaA_path)
+    b = np.load(pcab_path)
+    print('\nspkIDs:\n{}'.format(spks))
+    print('\ninput matrix:\n{}'.format(X))
+
+    labels = list(spks)
+    fig, ax = plt.subplots(figsize=(15,15))
+    cax = ax.matshow(X, interpolation='nearest') # heat map of the input matrix
+    ax.grid()
+    plt.xticks(range(len(labels)), labels, rotation=90);
+    plt.yticks(range(len(labels)), labels);
+    fig.colorbar(cax, ticks=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, .8, .9, 1])
+    fig_path = os.path.join(out_path,'{}.png'.format(X_path.split('/')[-1].replace('.npy','')))
+    print('\nsave fig {}'.format(fig_path))
+    plt.savefig(fig_path)
+
+    colordict = {} # corpus -> list holding the corpus color once per listed speaker
+    for corpus in corpora:
+        colordict[corpus] = []
+    assert len(corpora.keys()) <= len(colors), 'Not enough colors for corpora'
+    for idx, corpus in enumerate(corpora):
+        spks_tmp = corpora[corpus]
+        for spk in spks_tmp:
+            colordict[corpus].append(colors[idx])
+    print(colordict)
+    col = sum([lst for lst in colordict.values()], []) # flattened: one color per speaker, in JSON order
+
+    print("\nProject with PCA (3 dimensions)...")
+    X_proj = np.dot(X, A) + b # affine projection; columns 0..2 are plotted below (A presumably [n x 3] - TODO confirm)
+    print(f"\nshape(X_proj) = {np.shape(X_proj)}")
+    print('mean(X_proj,0): {}'.format(np.mean(X_proj,0)))
+    
+    # 3D plot:
+    fig = plt.figure()
+    ax = fig.add_subplot(111, projection='3d')
+    ax.scatter(X_proj[:,0], X_proj[:,1], X_proj[:,2], c=col)
+    # Set the axis labels
+    ax.set_xlabel('PCA1')
+    ax.set_ylabel('PCA2')
+    ax.set_zlabel('PCA3')
+    # Create the legend from empty proxy scatters, one per corpus
+    for k,v in colordict.items():
+        ax.scatter([], [], [], c=v[0], label=k, alpha=1, s=MS)
+    # Shrink the current axis to 85% of its width
+    box = ax.get_position()
+    ax.set_position([box.x0, box.y0, box.width * 0.85, box.height])
+    # Put a legend to the right of the current axis
+    ax.legend(loc='center left', bbox_to_anchor=(1.175, .5))
+    # Save the plot
+    fig_path = os.path.join(out_path,'scatter3D_{}.png'.format(X_path.split('/')[-1].replace('.npy','')))
+    print('\nsave fig {}'.format(fig_path))
+    plt.savefig(fig_path)
+    fig_path = os.path.join(out_path,'scatter3D_{}.eps'.format(X_path.split('/')[-1].replace('.npy','')))
+    print('save fig {}'.format(fig_path))
+    plt.savefig(fig_path, format='eps')
+    
+    plt.rcParams.update({'font.size': 50}) # applies to the 2D figures created from here on
+    for proj in [(0, 1), (0, 2), (1, 2)]: # every pair of the three projected dimensions
+        x, y = X_proj[:,proj[0]], X_proj[:,proj[1]]
+        fig, ax = plt.subplots(figsize=(18,16))
+
+        for i, txt in enumerate(spks):
+            print(f"speaker {txt}: x = {x[i]}, y = {y[i]}")
+            ax.plot(x[i], y[i], 'o', color=col[i], markersize=MS, alpha=.5, label=col[i]) # labelled by color, renamed below
+
+        handles, labels = plt.gca().get_legend_handles_labels()
+        by_label = dict(zip(labels, handles)) # one legend handle per distinct color
+
+        cols = list(by_label.keys())
+        for old, new in zip(cols, corpora.keys()): # NOTE(review): assumes colors first appear in corpora key order - confirm
+            by_label[new] = by_label.pop(old)
+        ax.legend(by_label.values(), by_label.keys(), fontsize=30)
+        plt.xlabel(f'PCA{str(proj[0]+1)}')
+        plt.ylabel(f'PCA{str(proj[1]+1)}')
+        plt.grid()
+        plt.tight_layout()
+        fig_path = os.path.join(out_path,'scatter_proj{}_{}.png'.format(''.join([str(proj[0]),str(proj[1])]), X_path.split('/')[-1].replace('.npy','')))
+        print('\nsave fig {}'.format(fig_path))
+        plt.savefig(fig_path)
+        fig_path = os.path.join(out_path,'scatter_proj{}_{}.eps'.format(''.join([str(proj[0]),str(proj[1])]), X_path.split('/')[-1].replace('.npy','')))
+        print('save fig {}'.format(fig_path))
+        plt.savefig(fig_path, format='eps')
+
+if __name__ == "__main__": # abort with exit code 1 as soon as a required path is missing
+    ############################# FEATURES PATH ###########################
+    try:
+        X_path = sys.argv[1]
+    except IndexError:
+        sys.exit("ERROR: X_path not specified")
+    print("input matrix path is: " + X_path)
+    ############################# LABELS PATH ###########################
+    try:
+        y_path = sys.argv[2]
+    except IndexError:
+        sys.exit("ERROR: y_path not specified")
+    print("label vector path is: " + y_path)
+    ############################# PCA A PATH ###########################
+    try:
+        pcaA_path = sys.argv[3]
+    except IndexError:
+        sys.exit("ERROR: pcaA_path not specified")
+    print("pca matrix A path is: " + pcaA_path)
+    ############################# PCA b PATH ###########################
+    try:
+        pcab_path = sys.argv[4]
+    except IndexError:
+        sys.exit("ERROR: pcab_path not specified")
+    print("pca vector b path is: " + pcab_path)
+    ############################# OUT PATH ###########################
+    try:
+        out_path = sys.argv[5]
+    except IndexError:
+        sys.exit("ERROR: out_path not specified")
+    print("output path is: " + out_path)
+    ############################# JSON PATH ###########################
+    try:
+        json_path = sys.argv[6]
+    except IndexError:
+        sys.exit("ERROR: json_path not specified")
+    print("json path is: " + json_path)
+
+    main(X_path, y_path, pcaA_path, pcab_path, out_path, json_path)
diff --git a/local/prepare_data.py b/local/prepare_data.py
new file mode 100644
index 0000000..1643678
--- /dev/null
+++ b/local/prepare_data.py
@@ -0,0 +1,53 @@
+# Author: Julian Linke (linke@tugraz.at)
+# SPSC TU Graz (July 2023)
+
+import os
+import json
+import argparse
+
+def get_speaker_ids_and_lst_lines(corpus_dir):
+    """Return (speaker IDs, "uttID audio_path" lines) for one corpus directory."""
+    # every sub-directory is a speaker, its .wav/.flac files are the utterances
+    spk_ids, lst_lines = [], []
+    for spk in sorted(os.listdir(corpus_dir)): # sorted: reproducible output
+        spk_dir = os.path.join(corpus_dir, spk)
+        if not os.path.isdir(spk_dir):
+            continue
+        spk_ids.append(spk)
+        for audio_file in sorted(os.listdir(spk_dir)):
+            uttID, ext = os.path.splitext(audio_file) # strips only the final extension
+            if ext in ('.wav', '.flac'):
+                lst_lines.append(f"{uttID} {os.path.join(spk_dir, audio_file)}\n")
+    return spk_ids, lst_lines
+
+def process_DATA_directory(DATA_dir):
+    """Return ({corpus: speaker IDs}, list lines) for all corpus directories in DATA_dir."""
+    spk_dict, lst_lines = {}, []
+    for corpus in sorted(os.listdir(DATA_dir)): # sorted: reproducible corpus order
+        corpus_dir = os.path.join(DATA_dir, corpus)
+        if os.path.isdir(corpus_dir):
+            # key: directory name without its first "_"-separated field
+            spk_ids, new_lst_lines = get_speaker_ids_and_lst_lines(corpus_dir)
+            spk_dict['_'.join(corpus.split('_')[1:])] = spk_ids
+            lst_lines.extend(new_lst_lines)
+    return spk_dict, lst_lines
+
+def write_output_files(output_lst_path, output_json_path, lst_lines, spk_dict):
+    """Write the list lines and the speaker dict (JSON, indent 4) to their files."""
+    with open(output_lst_path, 'w') as lst_file:
+        lst_file.write(''.join(lst_lines))
+    with open(output_json_path, 'w') as json_file:
+        json_file.write(json.dumps(spk_dict, indent=4))
+
+def main(output_lst_path, output_json_path, DATA_dir):
+    """Scan DATA_dir and write the resulting list and JSON files."""
+    speakers_by_corpus, list_lines = process_DATA_directory(DATA_dir)
+    write_output_files(output_lst_path, output_json_path, list_lines, speakers_by_corpus)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Generate DATA.lst and DATA.json files.')
+    for option, text in (('--output_lst_path', 'Path to the output .lst file.'),
+                         ('--output_json_path', 'Path to the output .json file.'),
+                         ('--DATA_dir', 'Path to the DATA directory.')):
+        parser.add_argument(option, required=True, help=text)
+    main(**vars(parser.parse_args())) # option names equal main()'s parameter names
diff --git a/local/similarity_matrix.py b/local/similarity_matrix.py
new file mode 100644
index 0000000..9912ee4
--- /dev/null
+++ b/local/similarity_matrix.py
@@ -0,0 +1,96 @@
+# Author: Julian Linke (linke@tugraz.at)
+# SPSC TU Graz (July 2023)
+
+import os, sys
+import json
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity 
+
+from scipy.stats import entropy
+from numpy.linalg import norm
+
+def JSD(P, Q):
+    """Return 1 minus the Jensen-Shannon divergence (natural log) of P and Q."""
+    p, q = P / norm(P, ord=1), Q / norm(Q, ord=1) # L1-normalize both inputs
+    m = 0.5 * (p + q)
+    return 1 - (0.5 * (entropy(p, m) + entropy(q, m)))
+
+def JSD_similarity(X):
+    """Return the [rows x rows] float32 matrix of pairwise JSD() values of the rows of X."""
+    sim = np.zeros((len(X),) * 2, dtype=np.float32)
+    print('\ncalculate JSD similarity matrix of X: {} ...'.format(X.shape))
+    for row in range(len(X)):
+        for col in range(len(X)):
+            sim[row, col] = JSD(X[row], X[col,:])
+    return sim
+
+def main(X_path, y_path, out_path, json_path):
+    """Sort the frequency rows by the JSON corpus order and save their JSD similarities.
+
+    The sorted matrix and labels overwrite ``X_path``/``y_path`` (plus .tsv
+    copies); the similarity matrix is written to ``out_path``.
+    """
+    # dict with {"corpusA_speakingstyle": [spk1 spk2,...], ...}
+    with open(json_path, 'r') as f:
+        corpora = json.load(f)
+
+    # LOAD MATRIX AND SPLITS
+    X = np.load(X_path)
+    splits = np.load(y_path, allow_pickle=True)
+    print('\ninput splits: {}\n... len: {}'.format(splits, len(splits)))
+    # SORT: look rows up by their "<spk><style>" label, in JSON order
+    d_splits = {split: X[idx,:] for idx, split in enumerate(splits)}
+    rows, labels = [], []
+    for corpus in corpora:
+        style = corpus.split("_")[1] # second entry is always style
+        for spk in corpora[corpus]:
+            split = f'{spk}{style}'
+            if split in d_splits:
+                rows.append(d_splits[split])
+                labels.append(split)
+            else: print('Wrong corpora entry: {}?'.format(split))
+    # keep matched rows only: an all-zero filler row would turn into NaN in JSD()
+    X = np.array(rows, dtype=np.float32).reshape(len(rows), np.shape(X)[1])
+    splits = np.array(labels, dtype=object)
+    print('\nsorted splits: {}\n... len: {}'.format(splits, len(splits)))
+
+    print('\nwrite sorted {} and {}'.format(X_path, y_path))
+    np.save(X_path, X)
+    np.save(y_path, splits)
+    X_tsv, y_tsv = X_path.replace('.npy','.tsv'), y_path.replace('.npy','.tsv')
+    print('write sorted {} and {}'.format(X_tsv, y_tsv))
+    np.savetxt(X_tsv, X, delimiter='\t')
+    np.savetxt(y_tsv, splits, fmt='%s', delimiter='\t')
+
+    # Calculate similarity matrix and save:
+    X_sim = JSD_similarity(X)
+    np.save(os.path.join(out_path,'similarity_matrix'), X_sim)
+    np.savetxt(os.path.join(out_path,'similarity_matrix.tsv'), X_sim, delimiter='\t')
+
+if __name__ == "__main__": # abort with exit code 1 as soon as a required path is missing
+    ############################# FEATURES PATH ###########################
+    try:
+        X_path = sys.argv[1]
+    except IndexError:
+        sys.exit("ERROR: X_path not specified")
+    print("input matrix path is: " + X_path)
+    ############################# LABELS PATH ###########################
+    try:
+        y_path = sys.argv[2]
+    except IndexError:
+        sys.exit("ERROR: y_path not specified")
+    print("label vector path is: " + y_path)
+    ############################# OUT PATH ###########################
+    try:
+        out_path = sys.argv[3]
+    except IndexError:
+        sys.exit("ERROR: out_path not specified")
+    print("output path is: " + out_path)
+    ############################# JSON PATH ###########################
+    try:
+        json_path = sys.argv[4]
+    except IndexError:
+        sys.exit("ERROR: json_path not specified")
+    print("json path is: " + json_path)
+
+    main(X_path, y_path, out_path, json_path)
diff --git a/path.sh b/path.sh
new file mode 100755
index 0000000..d7c0ec1
--- /dev/null
+++ b/path.sh
@@ -0,0 +1,2 @@
+export CWD=$(pwd)          # directory this file is sourced from; printed by run.sh
+export FAIRSEQ=../fairseq  # fairseq checkout; run.sh calls its wav2vec pca.py script
\ No newline at end of file
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..47b0e31
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+#set -x
+
+# Author: Julian Linke (linke@tugraz.at)
+# SPSC TU Graz (July 2023)
+#
+# Usage: ./run.sh <runDATA> <stage>
+#   runDATA  data directory handed to local/prepare_data.py; results go to exp_<runDATA>
+#   stage    1: prepare data, 2: codebook frequencies, 3: similarity matrix,
+#            4: PCA and plots, 0: delete the old experiment and run all stages
+
+set -e -o pipefail
+. path.sh
+. conda.sh
+
+if [[ $# -lt 2 ]] ; then
+    echo 'ERROR: this run-script requires two arguments: <runDATA> <stage>' >&2
+    exit 1
+fi
+
+## runDATA/STAGE
+runDATA=${1%/} # a trailing slash would end up inside the derived file names
+stage=$2
+VERBOSE=1 # write logs for codebook frequency extraction?
+printf "\n### STAGE ###\n"
+printf "stage: %d\n" "$stage"
+printf "### STAGE ###\n"
+
+## DIRS/PATHS
+model_path=model/xlsr_53_56k_new.pt
+exp_dir=exp_$runDATA
+
+## STAGE 0: DELETE AND RUN ALL STAGES
+if [ "$stage" == 0 ]; then
+    printf "\n... Delete old experiment and run all ...\n"
+    rm -rf -- "${exp_dir}"
+fi
+
+## print:
+printf "\nCWD: %s" "$CWD"
+printf "\nFAIRSEQ: %s" "$FAIRSEQ"
+printf "\nrunDATA: %s" "$runDATA"
+printf "\nmodel_path: %s" "$model_path"
+printf "\nexp_dir: %s\n\n" "$exp_dir"
+
+## CREATE EXPERIMENT FOLDER
+mkdir -p "$exp_dir/logs" "$exp_dir/data" "$exp_dir/plots" "$exp_dir/txt" \
+         "$exp_dir/numpy/pca"
+
+## PREPARE DATA
+if [ "$stage" == 1 ] || [ "$stage" == 0 ]; then
+    printf "\n... Prepare data (*lst and *json) ...\n"
+    python3 local/prepare_data.py --output_lst_path "$exp_dir/data/${runDATA}.lst" \
+                                 --output_json_path "$exp_dir/data/${runDATA}.json" \
+                                 --DATA_dir "${runDATA}"
+fi
+
+if [ "$stage" == 2 ] || [ "$stage" == 0 ]; then
+    printf "\n... Count frequencies of codebooks ...\n"
+    python3 local/codebook_freqs.py "$exp_dir" \
+                                   "$exp_dir/data/${runDATA}.lst" \
+                                   "$exp_dir/data/${runDATA}.json" \
+                                   "$model_path" \
+                                   "$VERBOSE"
+    printf "\n... Combine Arrays ... \n"
+    python3 local/codebook_combine_arrays.py "$exp_dir/numpy" \
+                                             "$exp_dir/data/${runDATA}.json"
+fi
+
+if [ "$stage" == 3 ] || [ "$stage" == 0 ]; then
+    printf "\n... Similarity Matrix ...\n"
+    python3 local/similarity_matrix.py \
+            "$exp_dir/numpy/splits_freqs.npy" \
+            "$exp_dir/numpy/splits_labels.npy" \
+            "$exp_dir/numpy" \
+            "$exp_dir/data/${runDATA}.json"
+fi
+
+if [ "$stage" == 4 ] || [ "$stage" == 0 ]; then
+    printf "\n... PCA of similarity matrix ...\n"
+    python3 "$FAIRSEQ/examples/wav2vec/unsupervised/scripts/pca.py" \
+        "$exp_dir/numpy/similarity_matrix.npy" \
+        --output "$exp_dir/numpy/pca" \
+        --dim 3
+    printf "\n... PLOT ...\n"
+    python3 local/plot_pca_similarities.py \
+        "$exp_dir/numpy/similarity_matrix.npy" \
+        "$exp_dir/numpy/splits_labels.npy" \
+        "$exp_dir/numpy/pca/3_pca_A.npy" \
+        "$exp_dir/numpy/pca/3_pca_b.npy" \
+        "$exp_dir/plots" \
+        "$exp_dir/data/${runDATA}.json"
+fi
\ No newline at end of file
-- 
GitLab