#!/usr/bin/env python
# -*- coding: ASCII -*-
'''
(c) University of Washington, Hudson-Alpha Institute for Biotechnology and
Berlin Institute of Health 2013-2023. All rights reserved.

This file describes how to train a CADD model from the provided training data.
The script needs about 300 GB of RAM and will fail on machines with less
available memory.
'''

### To run the script, the user needs Python with numpy and scikit-learn installed.
import numpy as np
import sklearn.preprocessing
from sklearn.linear_model import LogisticRegression
import joblib  # formerly sklearn.externals.joblib, which was removed in scikit-learn 0.23

### File names, adjust as needed
TRAINING_MATRIX = 'train.npz'
OUTPUT_MODEL_FILE = 'CADD.mod'

# load training data; the .npz archive is assumed to store the matrix under its first key
npzfile = np.load(TRAINING_MATRIX)
mat = npzfile[npzfile.files[0]]

# the first column holds the labels, the remaining columns the annotations
Y = mat[:, 0].reshape((mat.shape[0],))
X = mat[:, 1:]

# scale the training data to unit variance (no mean centering)
scaler = sklearn.preprocessing.StandardScaler(with_mean=False, copy=False)
X = scaler.fit_transform(X)

# train the model
clf = LogisticRegression(penalty='l2', C=1, max_iter=13, solver='lbfgs',
                         warm_start=True, verbose=3)
clf.fit(X, Y)

# store scaler and model together, with compression level 3
joblib.dump((clf, scaler), OUTPUT_MODEL_FILE, 3)

'''
The generated model file can be used with the CADD scripts as available at
github.com/kirchlab/CADD-scripts

Note that the coefficients of the generated model will not be 100% identical
to those of our CADD models. This is because we train our models with a random
1% hold-out test set and, rather than a single call that runs 13 iterations to
convergence, we train step-wise, one iteration at a time, in order to save,
evaluate and compare intermediate models.
'''
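

# The closing note above describes step-wise training: with warm_start=True the
# classifier can be refit one lbfgs iteration at a time, saving and evaluating an
# intermediate model after each step. The function below is a minimal sketch of
# that loop, not the exact procedure used for the released models; the 1% hold-out
# split, the per-iteration file names and the accuracy metric are assumptions.
def stepwise_training_sketch(X, Y, scaler, n_iterations=13):
    '''Train one iteration at a time, saving and evaluating each intermediate model.'''
    from sklearn.model_selection import train_test_split

    # random 1% hold-out test set, as mentioned in the closing note
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.01,
                                                        random_state=0)

    clf = LogisticRegression(penalty='l2', C=1, max_iter=1, solver='lbfgs',
                             warm_start=True, verbose=0)
    for iteration in range(1, n_iterations + 1):
        # warm_start continues from the previous coefficients; a ConvergenceWarning
        # per step is expected because each call performs only one iteration
        clf.fit(X_train, Y_train)
        accuracy = clf.score(X_test, Y_test)  # evaluate the intermediate model
        joblib.dump((clf, scaler), 'CADD_iter%02d.mod' % iteration, 3)
        print('iteration %d: hold-out accuracy %.4f' % (iteration, accuracy))
    return clf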
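

# For completeness, a sketch of how the stored (clf, scaler) pair might be loaded
# again and applied to a new annotation matrix. The actual scoring pipeline lives
# in the CADD-scripts repository and may differ; X_new here only stands in for a
# matrix with the same annotation columns (without the label column) as training.
def scoring_sketch(model_file, X_new):
    '''Load a stored model and return raw log-odds scores for a new annotation matrix.'''
    clf, scaler = joblib.load(model_file)
    X_new = scaler.transform(X_new)        # apply the same scaling as used in training
    return clf.decision_function(X_new)    # raw log-odds of the logistic regression model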