#!/usr/bin/env python
# -*- coding: ASCII -*-
'''
(c) University of Washington, Hudson-Alpha Institute for Biotechnology and
Berlin Institute of Health 2013-2023. All rights reserved.

This file describes how to train a CADD model from the provided training data.
The script needs about 300 GB of RAM and will fail on machines with less
available memory.
'''

### To run the script, the user needs Python with numpy and scikit-learn installed.
import numpy as np
import sklearn.preprocessing
from sklearn.linear_model import LogisticRegression
import joblib  # formerly sklearn.externals.joblib, which was removed in scikit-learn 0.23

### File names, adjust as needed
TRAINING_MATRIX = 'train.npz'
OUTPUT_MODEL_FILE = 'CADD.mod'

# load training data; the .npz archive is assumed to store the matrix under its first key
npzfile = np.load(TRAINING_MATRIX)
mat = npzfile[npzfile.files[0]]

# the first column holds the labels, the remaining columns the annotations
Y = mat[:, 0].reshape((mat.shape[0],))
X = mat[:, 1:]

# scale the training data to unit variance (no mean centering)
scaler = sklearn.preprocessing.StandardScaler(with_mean=False, copy=False)
X = scaler.fit_transform(X)

# train the model
clf = LogisticRegression(penalty='l2', C=1, max_iter=13, solver='lbfgs',
                         warm_start=True, verbose=3)
clf.fit(X, Y)

# store scaler and model together, with compression level 3
joblib.dump((clf, scaler), OUTPUT_MODEL_FILE, 3)

'''
The generated model file can be used with the CADD scripts as available at
github.com/kirchlab/CADD-scripts

Note that the coefficients of the generated model will not be 100% identical
to those of our CADD models. This is because we train our models with a random
1% hold-out test set and, rather than a single call that runs 13 iterations to
convergence, we train step-wise, one iteration at a time, in order to save,
evaluate and compare intermediate models.
'''
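

# The closing note above describes step-wise training: with warm_start=True the
# classifier can be refit one lbfgs iteration at a time, saving and evaluating an
# intermediate model after each step. The function below is a minimal sketch of
# that loop, not the exact procedure used for the released models; the 1% hold-out
# split, the per-iteration file names and the accuracy metric are assumptions.
def stepwise_training_sketch(X, Y, scaler, n_iterations=13):
    '''Train one iteration at a time, saving and evaluating each intermediate model.'''
    from sklearn.model_selection import train_test_split

    # random 1% hold-out test set, as mentioned in the closing note
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.01,
                                                        random_state=0)

    clf = LogisticRegression(penalty='l2', C=1, max_iter=1, solver='lbfgs',
                             warm_start=True, verbose=0)
    for iteration in range(1, n_iterations + 1):
        # warm_start continues from the previous coefficients; a ConvergenceWarning
        # per step is expected because each call performs only one iteration
        clf.fit(X_train, Y_train)
        accuracy = clf.score(X_test, Y_test)  # evaluate the intermediate model
        joblib.dump((clf, scaler), 'CADD_iter%02d.mod' % iteration, 3)
        print('iteration %d: hold-out accuracy %.4f' % (iteration, accuracy))
    return clf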
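

# For completeness, a sketch of how the stored (clf, scaler) pair might be loaded
# again and applied to a new annotation matrix. The actual scoring pipeline lives
# in the CADD-scripts repository and may differ; X_new here only stands in for a
# matrix with the same annotation columns (without the label column) as training.
def scoring_sketch(model_file, X_new):
    '''Load a stored model and return raw log-odds scores for a new annotation matrix.'''
    clf, scaler = joblib.load(model_file)
    X_new = scaler.transform(X_new)        # apply the same scaling as used in training
    return clf.decision_function(X_new)    # raw log-odds of the logistic regression model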