• R/O
  • SSH



Frequently used words (click to add to your profile)

javac++androidlinuxc#windowsobjective-ccocoa誰得qtpythonphprubygameguibathyscaphec計画中(planning stage)翻訳omegatframeworktwitterdomtestvb.netdirectxゲームエンジンbtronarduinopreviewer

Commit MetaInfo

修訂372bf58aa1eba13e378314c8b9d343fd7fd4210b (tree)
時間2013-06-25 02:04:38
作者Lorenzo Isella <lorenzo.isella@gmai...>
Committer: Lorenzo Isella

Log Message

Code to carry out logistic regression in Python.
I still need to read and understand it, and it may be possible to
speed it up or improve it.

Change Summary


diff -r 90d01d56e457 -r 372bf58aa1eb Python-codes/logistic_regression_updated.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Python-codes/logistic_regression_updated.py Mon Jun 24 19:04:38 2013 +0200
@@ -0,0 +1,164 @@
1+__author__ = 'Miroslaw Horbal'
2+__email__ = 'miroslaw@gmail.com'
3+__date__ = '14-06-2013'
5+from numpy import array, hstack
6+from sklearn import metrics, cross_validation, linear_model
7+from scipy import sparse
8+from itertools import combinations
10+import numpy as np
11+import pandas as pd
# Base random seed; cv_loop multiplies it by the fold index to derive the
# random_state of each train/validation split.
SEED = 25
def group_data(data, degree=3, hash=hash):
    """
    numpy.array -> numpy.array

    Hash every combination of `degree` columns of `data` into one new column.

    For each combination of column indices (in the order produced by
    itertools.combinations), every row's values in those columns are packed
    into a tuple and passed to `hash`. The results form one output column.

    Parameters:
        data   -- 2-D array of shape (m, n).
        degree -- number of columns combined into each new feature.
        hash   -- callable mapping a tuple of row values to a single value;
                  defaults to the builtin hash.

    Returns:
        Array with m rows and one column per combination. If no combination
        exists (degree > n) an array of shape (m, 0) is returned so the
        result can still be stacked next to other row-aligned matrices.
    """
    m, n = data.shape
    new_data = [
        # list(...) makes the fancy column index explicit
        [hash(tuple(v)) for v in data[:, list(indicies)]]
        for indicies in combinations(range(n), degree)
    ]
    if not new_data:
        # array([]).T would be shape (0,), which loses the row count.
        return np.empty((m, 0), dtype=int)
    return array(new_data).T
def OneHotEncoder(data, keymap=None):
    """
    OneHotEncoder takes data matrix with categorical columns and
    converts it to a sparse binary matrix.

    Parameters:
        data   -- 2-D array; each column is treated as one categorical
                  feature.
        keymap -- optional list with one dict per column mapping category
                  value -> column offset inside that feature's one-hot
                  group. When omitted, a keymap is built from `data`, with
                  offsets assigned in order of first appearance so the
                  column layout is reproducible from run to run.

    Returns:
        (matrix, keymap): a CSR matrix with one row per row of `data` and
        one column per known category, and the keymap that was used.
        If a keymap is supplied on input, categories appearing in the data
        that are not in the keymap are ignored (their row stays all-zero
        for that feature).
    """
    if keymap is None:
        keymap = []
        for col in data.T:
            km = {}
            for key in col:
                if key not in km:
                    km[key] = len(km)
            keymap.append(km)

    total_pts = data.shape[0]
    outdat = []
    for i, col in enumerate(data.T):
        km = keymap[i]
        rows = []
        cols = []
        for j, val in enumerate(col):
            idx = km.get(val)
            if idx is not None:
                rows.append(j)
                cols.append(idx)
        # Build each feature's block in one shot instead of assigning
        # element by element into a lil_matrix.
        spmat = sparse.coo_matrix(
            (np.ones(len(rows)),
             (np.asarray(rows, dtype=np.intp), np.asarray(cols, dtype=np.intp))),
            shape=(total_pts, len(km)))
        outdat.append(spmat)
    outdat = sparse.hstack(outdat).tocsr()
    return outdat, keymap
def create_test_submission(filename, prediction):
    """
    Write predictions to `filename` as a two-column CSV.

    The first line is the header 'id,ACTION'. Each following line holds a
    1-based row id and the corresponding prediction formatted with '%f'.
    No trailing newline is written. Prints 'Saved' when done.

    Parameters:
        filename   -- path of the file to create (overwritten if present).
        prediction -- iterable of numeric predictions.
    """
    content = ['id,ACTION']
    content.extend('%i,%f' % (i, p) for i, p in enumerate(prediction, 1))
    # The context manager closes the file even if the write fails.
    with open(filename, 'w') as f:
        f.write('\n'.join(content))
    # Parenthesised single-argument print works on Python 2 and 3.
    print('Saved')
# This loop is essentially taken from Paul's starter code
def cv_loop(X, y, model, N):
    """
    Return the mean AUC of `model` over N random 80/20 train/validation splits.

    For fold i the split uses random_state = i * SEED, so repeated calls
    evaluate on the same splits. The model is refit on every fold and the
    per-fold AUC is printed.

    Parameters:
        X     -- feature matrix accepted by train_test_split and the model.
        y     -- target labels aligned with the rows of X.
        model -- estimator exposing fit() and predict_proba().
        N     -- number of splits; must be > 0 (N == 0 divides by zero).
    """
    # scikit-learn renamed auc_score to roc_auc_score and later removed the
    # old name; use whichever this installation provides.
    auc_score = getattr(metrics, 'roc_auc_score', None) or metrics.auc_score
    mean_auc = 0.
    for i in range(N):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.20,
            random_state=i * SEED)
        model.fit(X_train, y_train)
        # Column 1 of predict_proba is the probability of the second class.
        preds = model.predict_proba(X_cv)[:, 1]
        auc = auc_score(y_cv, preds)
        print("AUC (fold %d/%d): %f" % (i + 1, N, auc))
        mean_auc += auc
    return mean_auc / N
def main(train='train.csv', test='test.csv', submit='logistic_pred.csv'):
    """
    Train a logistic regression on hashed feature combinations and write
    predictions for the test set.

    Steps:
      1. Read the train and test CSV files and stack their feature columns
         (every column except the first and the last of each file).
      2. Add hashed features for all column pairs and triples (group_data).
      3. Greedy forward feature selection: repeatedly add the feature whose
         one-hot encoding gives the best mean AUC (cv_loop with N=10 splits)
         until the best score stops improving.
      4. Pick the regularisation strength C with the best mean AUC from 15
         log-spaced values between 2**-4 and 2**4.
      5. One-hot encode the selected features over train+test together,
         fit on the training rows with the best C, and save the predicted
         probabilities for the test rows.

    Parameters:
        train  -- path of the training CSV; must contain an ACTION column.
        test   -- path of the test CSV.
        submit -- path of the prediction file to write.

    Raises:
        ValueError -- if the data yields no feature columns at all.
    """
    print("Reading dataset...")
    train_data = pd.read_csv(train)
    test_data = pd.read_csv(test)
    # Positional slice (first and last column dropped); .iloc replaces the
    # .ix indexer, which modern pandas no longer provides.
    all_data = np.vstack((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:-1]))

    num_train = np.shape(train_data)[0]

    # Transform data
    print("Transforming data...")
    dp = group_data(all_data, degree=2)
    dt = group_data(all_data, degree=3)

    y = array(train_data.ACTION)
    X_train_all = np.hstack(
        (all_data[:num_train], dp[:num_train], dt[:num_train]))
    X_test_all = np.hstack(
        (all_data[num_train:], dp[num_train:], dt[num_train:]))
    num_features = X_train_all.shape[1]

    model = linear_model.LogisticRegression()

    # Xts holds one hot encodings for each individual feature in memory
    # speeding up feature selection
    Xts = [OneHotEncoder(X_train_all[:, [i]])[0] for i in range(num_features)]

    print("Performing greedy feature selection...")
    score_hist = []
    N = 10
    good_features = set()
    # Greedy feature selection loop: keep adding the best remaining feature
    # while the best mean AUC keeps improving.
    while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
        candidates = [f for f in range(len(Xts)) if f not in good_features]
        if not candidates:
            # Every feature is already selected and each addition improved
            # the score; nothing to undo.
            break
        scores = []
        for f in candidates:
            feats = list(good_features) + [f]
            Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
            score = cv_loop(Xt, y, model, N)
            scores.append((score, f))
            print("Feature: %i Mean AUC: %f" % (f, score))
        best = max(scores)
        good_features.add(best[1])
        score_hist.append(best)
        print("Current features: %s" % sorted(good_features))
    else:
        # The loop ended because the last addition did not improve the
        # score (not via break), so remove that last added feature.
        good_features.remove(score_hist[-1][1])

    if not good_features:
        raise ValueError("no feature columns available for selection")
    good_features = sorted(good_features)
    print("Selected features %s" % good_features)

    print("Performing hyperparameter selection...")
    # Hyperparameter selection loop
    score_hist = []
    Xt = sparse.hstack([Xts[j] for j in good_features]).tocsr()
    Cvals = np.logspace(-4, 4, 15, base=2)
    for C in Cvals:
        model.C = C
        score = cv_loop(Xt, y, model, N)
        score_hist.append((score, C))
        print("C: %f Mean AUC: %f" % (C, score))
    bestC = max(score_hist)[1]
    print("Best C value: %f" % (bestC))
    # Apply the selected value; otherwise the model would keep the last C
    # tried by the search loop.
    model.C = bestC

    print("Performing One Hot Encoding on entire dataset...")
    # Encode train and test together so both share one category layout.
    Xt = np.vstack((X_train_all[:, good_features],
                    X_test_all[:, good_features]))
    Xt, _ = OneHotEncoder(Xt)
    X_train = Xt[:num_train]
    X_test = Xt[num_train:]

    print("Training full model...")
    model.fit(X_train, y)

    print("Making prediction and saving results...")
    preds = model.predict_proba(X_test)[:, 1]
    create_test_submission(submit, preds)
# Script entry point: run the full pipeline on the default input files and
# write the predictions to 'logistic_regression_pred.csv'.
if __name__ == "__main__":
    main(train='train.csv',
         test='test.csv',
         submit='logistic_regression_pred.csv')