forked from apache/systemds
-
Notifications
You must be signed in to change notification settings - Fork 0
/
stratstats.dml
396 lines (346 loc) · 21.5 KB
/
stratstats.dml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# STRATIFIED BIVARIATE STATISTICS, VERSION 4
#
# INPUT PARAMETERS:
# -----------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# -----------------------------------------------------------------------------
# X String --- Location to read matrix X that has all 1-st covariates
# Y String " " Location to read matrix Y that has all 2-nd covariates
# the default value " " means "use X in place of Y"
# S String " " Location to read matrix S that has the stratum column
# the default value " " means "use X in place of S"
# Xcid String " " Location to read the 1-st covariate X-column indices
# the default value " " means "use columns 1 : ncol(X)"
# Ycid String " " Location to read the 2-nd covariate Y-column indices
# the default value " " means "use columns 1 : ncol(Y)"
# Scid Int 1 Column index of the stratum column in S
# O String --- Location to store the output matrix (see below)
# fmt String "text" Matrix output format, usually "text" or "csv"
# -----------------------------------------------------------------------------
# Note: the stratum column must contain small positive integers; all fractional
# values are rounded; strata with ID <= 0 or NaN are treated as missing.
#
# OUTPUT MATRIX:
# One row per each distinct pair (1st covariate, 2nd covariate)
# 40 columns containing the following information:
# Col 01: 1st covariate X-column number
# Col 02: 1st covariate global presence count
# Col 03: 1st covariate global mean
# Col 04: 1st covariate global standard deviation
# Col 05: 1st covariate stratified standard deviation
# Col 06: R-squared, 1st covariate vs. strata
# Col 07: adjusted R-squared, 1st covariate vs. strata
# Col 08: P-value, 1st covariate vs. strata
# Col 09-10: Reserved
# Col 11: 2nd covariate Y-column number
# Col 12: 2nd covariate global presence count
# Col 13: 2nd covariate global mean
# Col 14: 2nd covariate global standard deviation
# Col 15: 2nd covariate stratified standard deviation
# Col 16: R-squared, 2nd covariate vs. strata
# Col 17: adjusted R-squared, 2nd covariate vs. strata
# Col 18: P-value, 2nd covariate vs. strata
# Col 19-20: Reserved
# Col 21: Global 1st & 2nd covariate presence count
# Col 22: Global regression slope (2nd vs. 1st covariate)
# Col 23: Global regression slope standard deviation
# Col 24: Global correlation = +/- sqrt(R-squared)
# Col 25: Global residual standard deviation
# Col 26: Global R-squared
# Col 27: Global adjusted R-squared
# Col 28: Global P-value for hypothesis "slope = 0"
# Col 29-30: Reserved
# Col 31: Stratified 1st & 2nd covariate presence count
# Col 32: Stratified regression slope (2nd vs. 1st covariate)
# Col 33: Stratified regression slope standard deviation
# Col 34: Stratified correlation = +/- sqrt(R-squared)
# Col 35: Stratified residual standard deviation
# Col 36: Stratified R-squared
# Col 37: Stratified adjusted R-squared
# Col 38: Stratified P-value for hypothesis "slope = 0"
# Col 39: Number of strata with at least two counted points
# Col 40: Reserved
#
# EXAMPLES:
#
# hadoop jar SystemML.jar -f stratstats.dml -nvargs X=INPUT_DIR/X.mtx Xcid=INPUT_DIR/Xcid.mtx
# Y=INPUT_DIR/Y.mtx Ycid=INPUT_DIR/Ycid.mtx S=INPUT_DIR/S.mtx Scid=1 O=OUTPUT_DIR/Out.mtx fmt=csv
#
# hadoop jar SystemML.jar -f stratstats.dml -nvargs X=INPUT_DIR/Data.mtx Xcid=INPUT_DIR/Xcid.mtx
# Ycid=INPUT_DIR/Ycid.mtx Scid=1 O=OUTPUT_DIR/Out.mtx
fileX = $X;
fileY = ifdef ($Y, " ");
fileS = ifdef ($S, " ");
fileO = $O;
fmtO = ifdef ($fmt, "text");
fileXcid = ifdef ($Xcid, " ");
fileYcid = ifdef ($Ycid, " ");
stratum_column_id = ifdef ($Scid, 1);
print ("BEGIN STRATIFIED STATISTICS SCRIPT");
print ("Reading the input matrices...");
XwithNaNs = read (fileX);
if (fileY != " ") {
YwithNaNs = read (fileY);
} else {
YwithNaNs = XwithNaNs;
}
if (fileS != " ") {
SwithNaNsFull = read (fileS);
SwithNaNs = SwithNaNsFull [, stratum_column_id];
} else {
SwithNaNs = XwithNaNs [, stratum_column_id];
}
if (fileXcid != " ") {
Xcols = read (fileXcid);
} else {
Xcols = t(seq (1, ncol (XwithNaNs), 1));
}
if (fileYcid != " ") {
Ycols = read (fileYcid);
} else {
Ycols = t(seq (1, ncol (YwithNaNs), 1));
}
tXcols = t(Xcols);
tYcols = t(Ycols);
num_records = nrow (XwithNaNs);
num_attrs = ncol (XwithNaNs);
num_attrs_X = ncol (Xcols);
num_attrs_Y = ncol (Ycols);
num_attrs_XY = num_attrs_X * num_attrs_Y;
print ("Preparing the covariates...");
XnoNaNs = replace (target = XwithNaNs, pattern = NaN, replacement = 0);
YnoNaNs = replace (target = YwithNaNs, pattern = NaN, replacement = 0);
XNaNmask = (XwithNaNs == XwithNaNs);
YNaNmask = (YwithNaNs == YwithNaNs);
one_to_num_attrs_X = seq (1, num_attrs_X, 1);
one_to_num_attrs_Y = seq (1, num_attrs_Y, 1);
ProjX = matrix (0, rows = num_attrs, cols = num_attrs_X);
ProjY = matrix (0, rows = num_attrs, cols = num_attrs_Y);
ProjX_ctable = table (tXcols, one_to_num_attrs_X);
ProjX [1 : nrow (ProjX_ctable), ] = ProjX_ctable;
ProjY_ctable = table (tYcols, one_to_num_attrs_Y);
ProjY [1 : nrow (ProjY_ctable), ] = ProjY_ctable;
X = XnoNaNs %*% ProjX;
Y = YnoNaNs %*% ProjY;
X_mask = XNaNmask %*% ProjX;
Y_mask = YNaNmask %*% ProjY;
print ("Preparing the strata...");
SnoNaNs = replace (target = SwithNaNs, pattern = NaN, replacement = 0);
S = round (SnoNaNs) * (SnoNaNs > 0);
Proj_good_stratumID = diag (S > 0);
Proj_good_stratumID = removeEmpty (target = Proj_good_stratumID, margin = "rows");
vector_of_good_stratumIDs = Proj_good_stratumID %*% S;
vector_of_good_stratumIDs = vector_of_good_stratumIDs + (1 - min (vector_of_good_stratumIDs));
num_records_with_good_stratumID = nrow (Proj_good_stratumID);
one_to_num_records_with_good_stratumID = seq (1, num_records_with_good_stratumID, 1);
# Create a group-by summation matrix for records over stratum IDs
# "with_empty" means with stratum IDs that never occur in records
num_strata_with_empty = max (vector_of_good_stratumIDs);
StrataSummator_with_empty = table (vector_of_good_stratumIDs, one_to_num_records_with_good_stratumID);
StrataSummator = removeEmpty (target = StrataSummator_with_empty, margin = "rows");
StrataSummator = StrataSummator %*% Proj_good_stratumID;
num_strata = nrow (StrataSummator);
num_empty_strata = num_strata_with_empty - num_strata;
print ("There are " + num_strata + " nonempty strata and " + num_empty_strata + " empty but positive-ID strata.");
print ("Computing the global single-variate statistics...");
cnt_X_global = colSums (X_mask);
cnt_Y_global = colSums (Y_mask);
avg_X_global = colSums (X) / cnt_X_global;
avg_Y_global = colSums (Y) / cnt_Y_global;
var_sumX_global = colSums (X * X) - cnt_X_global * (avg_X_global * avg_X_global);
var_sumY_global = colSums (Y * Y) - cnt_Y_global * (avg_Y_global * avg_Y_global);
sqrt_failsafe_input_1 = var_sumX_global / (cnt_X_global - 1);
stdev_X_global = sqrt_failsafe (sqrt_failsafe_input_1);
sqrt_failsafe_input_2 = var_sumY_global / (cnt_Y_global - 1);
stdev_Y_global = sqrt_failsafe (sqrt_failsafe_input_2);
print ("Computing the stratified single-variate statistics...");
# Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata
Cnt_X_per_stratum = StrataSummator %*% X_mask;
Cnt_Y_per_stratum = StrataSummator %*% Y_mask;
Is_none_X_per_stratum = (Cnt_X_per_stratum == 0);
Is_none_Y_per_stratum = (Cnt_Y_per_stratum == 0);
One_over_cnt_X_per_stratum = (1 - Is_none_X_per_stratum) / (Cnt_X_per_stratum + Is_none_X_per_stratum);
One_over_cnt_Y_per_stratum = (1 - Is_none_Y_per_stratum) / (Cnt_Y_per_stratum + Is_none_Y_per_stratum);
num_X_nonempty_strata = num_strata - colSums (Is_none_X_per_stratum);
num_Y_nonempty_strata = num_strata - colSums (Is_none_Y_per_stratum);
Sum_X_per_stratum = StrataSummator %*% X;
Sum_Y_per_stratum = StrataSummator %*% Y;
# Recompute some global statistics to exclude bad stratum-ID records
cnt_X_with_good_stratumID = colSums (Cnt_X_per_stratum);
cnt_Y_with_good_stratumID = colSums (Cnt_Y_per_stratum);
sum_X_with_good_stratumID = colSums (Sum_X_per_stratum);
sum_Y_with_good_stratumID = colSums (Sum_Y_per_stratum);
var_sumX_with_good_stratumID = colSums (StrataSummator %*% (X * X)) - (sum_X_with_good_stratumID * sum_X_with_good_stratumID) / cnt_X_with_good_stratumID;
var_sumY_with_good_stratumID = colSums (StrataSummator %*% (Y * Y)) - (sum_Y_with_good_stratumID * sum_Y_with_good_stratumID) / cnt_Y_with_good_stratumID;
# Compute the stratified statistics
var_sumX_stratified = colSums (StrataSummator %*% (X * X)) - colSums (One_over_cnt_X_per_stratum * Sum_X_per_stratum * Sum_X_per_stratum);
var_sumY_stratified = colSums (StrataSummator %*% (Y * Y)) - colSums (One_over_cnt_Y_per_stratum * Sum_Y_per_stratum * Sum_Y_per_stratum);
sqrt_failsafe_input_3 = var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata);
stdev_X_stratified = sqrt_failsafe (sqrt_failsafe_input_3);
sqrt_failsafe_input_4 = var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata);
stdev_Y_stratified = sqrt_failsafe (sqrt_failsafe_input_4);
r_sqr_X_vs_strata = 1 - var_sumX_stratified / var_sumX_with_good_stratumID;
r_sqr_Y_vs_strata = 1 - var_sumY_stratified / var_sumY_with_good_stratumID;
adj_r_sqr_X_vs_strata = 1 - (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata)) / (var_sumX_with_good_stratumID / (cnt_X_with_good_stratumID - 1));
adj_r_sqr_Y_vs_strata = 1 - (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata)) / (var_sumY_with_good_stratumID / (cnt_Y_with_good_stratumID - 1));
fStat_X_vs_strata = ((var_sumX_with_good_stratumID - var_sumX_stratified) / (num_X_nonempty_strata - 1)) / (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata));
fStat_Y_vs_strata = ((var_sumY_with_good_stratumID - var_sumY_stratified) / (num_Y_nonempty_strata - 1)) / (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata));
p_val_X_vs_strata = fStat_tailprob (fStat_X_vs_strata, num_X_nonempty_strata - 1, cnt_X_with_good_stratumID - num_X_nonempty_strata);
p_val_Y_vs_strata = fStat_tailprob (fStat_Y_vs_strata, num_Y_nonempty_strata - 1, cnt_Y_with_good_stratumID - num_Y_nonempty_strata);
print ("Computing the global bivariate statistics...");
# Compute the aggregate X vs. Y statistics and map them into proper positions
cnt_XY_rectangle = t(X_mask) %*% Y_mask;
sum_X_forXY_rectangle = t(X) %*% Y_mask;
sum_XX_forXY_rectangle = t(X * X) %*% Y_mask;
sum_Y_forXY_rectangle = t(X_mask) %*% Y;
sum_YY_forXY_rectangle = t(X_mask) %*% (Y * Y);
sum_XY_rectangle = t(X) %*% Y;
cnt_XY_global = matrix (cnt_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_X_forXY_global = matrix (sum_X_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_XX_forXY_global = matrix (sum_XX_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_Y_forXY_global = matrix (sum_Y_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_YY_forXY_global = matrix (sum_YY_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_XY_global = matrix (sum_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
ones_XY = matrix (1.0, rows = 1, cols = num_attrs_XY);
# Compute the global bivariate statistics for output
cov_sumX_sumY_global = sum_XY_global - sum_X_forXY_global * sum_Y_forXY_global / cnt_XY_global;
var_sumX_forXY_global = sum_XX_forXY_global - sum_X_forXY_global * sum_X_forXY_global / cnt_XY_global;
var_sumY_forXY_global = sum_YY_forXY_global - sum_Y_forXY_global * sum_Y_forXY_global / cnt_XY_global;
slope_XY_global = cov_sumX_sumY_global / var_sumX_forXY_global;
sqrt_failsafe_input_5 = var_sumX_forXY_global * var_sumY_forXY_global;
sqrt_failsafe_output_5 = sqrt_failsafe (sqrt_failsafe_input_5);
corr_XY_global = cov_sumX_sumY_global / sqrt_failsafe_output_5;
r_sqr_X_vs_Y_global = cov_sumX_sumY_global * cov_sumX_sumY_global / (var_sumX_forXY_global * var_sumY_forXY_global);
adj_r_sqr_X_vs_Y_global = 1 - (1 - r_sqr_X_vs_Y_global) * (cnt_XY_global - 1) / (cnt_XY_global - 2);
sqrt_failsafe_input_6 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / var_sumX_forXY_global / (cnt_XY_global - 2)
stdev_slope_XY_global = sqrt_failsafe (sqrt_failsafe_input_6);
sqrt_failsafe_input_7 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / (cnt_XY_global - 2)
stdev_errY_vs_X_global = sqrt_failsafe (sqrt_failsafe_input_7);
fStat_Y_vs_X_global = (cnt_XY_global - 2) * r_sqr_X_vs_Y_global / (1 - r_sqr_X_vs_Y_global);
p_val_Y_vs_X_global = fStat_tailprob (fStat_Y_vs_X_global, ones_XY, cnt_XY_global - 2);
print ("Computing the stratified bivariate statistics...");
# Create projections to "intermingle" X and Y into attribute pairs
Proj_X_to_XY = matrix (0.0, rows = num_attrs_X, cols = num_attrs_XY);
Proj_Y_to_XY = matrix (0.0, rows = num_attrs_Y, cols = num_attrs_XY);
ones_Y_col = matrix (1.0, rows = num_attrs_Y, cols = 1);
for (i in 1:num_attrs_X) {
start_cid = (i - 1) * num_attrs_Y + 1;
end_cid = i * num_attrs_Y;
Proj_X_to_XY [i, start_cid:end_cid] = t(ones_Y_col);
Proj_Y_to_XY [ , start_cid:end_cid] = diag (ones_Y_col);
}
# Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata
Cnt_XY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
Sum_X_forXY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
Sum_XX_forXY_per_stratum = StrataSummator %*% (((X * X) %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
Sum_Y_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY));
Sum_YY_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ((Y * Y) %*% Proj_Y_to_XY));
Sum_XY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY));
Is_none_XY_per_stratum = (Cnt_XY_per_stratum == 0);
One_over_cnt_XY_per_stratum = (1 - Is_none_XY_per_stratum) / (Cnt_XY_per_stratum + Is_none_XY_per_stratum);
num_XY_nonempty_strata = num_strata - colSums (Is_none_XY_per_stratum);
# Recompute some global aggregate X vs. Y statistics to exclude bad stratum-ID records
cnt_XY_with_good_stratumID = colSums (Cnt_XY_per_stratum);
sum_XX_forXY_with_good_stratumID = colSums (Sum_XX_forXY_per_stratum);
sum_YY_forXY_with_good_stratumID = colSums (Sum_YY_forXY_per_stratum);
sum_XY_with_good_stratumID = colSums (Sum_XY_per_stratum);
# Compute the stratified bivariate statistics
var_sumX_forXY_stratified = sum_XX_forXY_with_good_stratumID - colSums (Sum_X_forXY_per_stratum * Sum_X_forXY_per_stratum * One_over_cnt_XY_per_stratum);
var_sumY_forXY_stratified = sum_YY_forXY_with_good_stratumID - colSums (Sum_Y_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
cov_sumX_sumY_stratified = sum_XY_with_good_stratumID - colSums (Sum_X_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
slope_XY_stratified = cov_sumX_sumY_stratified / var_sumX_forXY_stratified;
sqrt_failsafe_input_8 = var_sumX_forXY_stratified * var_sumY_forXY_stratified;
sqrt_failsafe_output_8 = sqrt_failsafe (sqrt_failsafe_input_8);
corr_XY_stratified = cov_sumX_sumY_stratified / sqrt_failsafe_output_8;
r_sqr_X_vs_Y_stratified = (cov_sumX_sumY_stratified ^ 2) / (var_sumX_forXY_stratified * var_sumY_forXY_stratified);
temp_X_vs_Y_stratified = (1 - r_sqr_X_vs_Y_stratified) / (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1)
adj_r_sqr_X_vs_Y_stratified = 1 - temp_X_vs_Y_stratified * (cnt_XY_with_good_stratumID - num_XY_nonempty_strata);
sqrt_failsafe_input_9 = temp_X_vs_Y_stratified * var_sumY_forXY_stratified;
stdev_errY_vs_X_stratified = sqrt_failsafe (sqrt_failsafe_input_9);
sqrt_failsafe_input_10 = sqrt_failsafe_input_9 / var_sumX_forXY_stratified;
stdev_slope_XY_stratified = sqrt_failsafe (sqrt_failsafe_input_10);
fStat_Y_vs_X_stratified = (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1) * r_sqr_X_vs_Y_stratified / (1 - r_sqr_X_vs_Y_stratified);
p_val_Y_vs_X_stratified = fStat_tailprob (fStat_Y_vs_X_stratified, ones_XY, cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1);
print ("Preparing the output matrix...");
OutMtx = matrix (0.0, rows = 40, cols = num_attrs_XY);
OutMtx [ 1, ] = Xcols %*% Proj_X_to_XY; # 1st covariate column number
OutMtx [ 2, ] = cnt_X_global %*% Proj_X_to_XY; # 1st covariate global presence count
OutMtx [ 3, ] = avg_X_global %*% Proj_X_to_XY; # 1st covariate global mean
OutMtx [ 4, ] = stdev_X_global %*% Proj_X_to_XY; # 1st covariate global standard deviation
OutMtx [ 5, ] = stdev_X_stratified %*% Proj_X_to_XY; # 1st covariate stratified standard deviation
OutMtx [ 6, ] = r_sqr_X_vs_strata %*% Proj_X_to_XY; # R-squared, 1st covariate vs. strata
OutMtx [ 7, ] = adj_r_sqr_X_vs_strata %*% Proj_X_to_XY; # adjusted R-squared, 1st covariate vs. strata
OutMtx [ 8, ] = p_val_X_vs_strata %*% Proj_X_to_XY; # P-value, 1st covariate vs. strata
OutMtx [11, ] = Ycols %*% Proj_Y_to_XY; # 2nd covariate column number
OutMtx [12, ] = cnt_Y_global %*% Proj_Y_to_XY; # 2nd covariate global presence count
OutMtx [13, ] = avg_Y_global %*% Proj_Y_to_XY; # 2nd covariate global mean
OutMtx [14, ] = stdev_Y_global %*% Proj_Y_to_XY; # 2nd covariate global standard deviation
OutMtx [15, ] = stdev_Y_stratified %*% Proj_Y_to_XY; # 2nd covariate stratified standard deviation
OutMtx [16, ] = r_sqr_Y_vs_strata %*% Proj_Y_to_XY; # R-squared, 2nd covariate vs. strata
OutMtx [17, ] = adj_r_sqr_Y_vs_strata %*% Proj_Y_to_XY; # adjusted R-squared, 2nd covariate vs. strata
OutMtx [18, ] = p_val_Y_vs_strata %*% Proj_Y_to_XY; # P-value, 2nd covariate vs. strata
OutMtx [21, ] = cnt_XY_global; # Global 1st & 2nd covariate presence count
OutMtx [22, ] = slope_XY_global; # Global regression slope (2nd vs. 1st covariate)
OutMtx [23, ] = stdev_slope_XY_global; # Global regression slope standard deviation
OutMtx [24, ] = corr_XY_global; # Global correlation = +/- sqrt(R-squared)
OutMtx [25, ] = stdev_errY_vs_X_global; # Global residual standard deviation
OutMtx [26, ] = r_sqr_X_vs_Y_global; # Global R-squared
OutMtx [27, ] = adj_r_sqr_X_vs_Y_global; # Global adjusted R-squared
OutMtx [28, ] = p_val_Y_vs_X_global; # Global P-value for hypothesis "slope = 0"
OutMtx [31, ] = cnt_XY_with_good_stratumID; # Stratified 1st & 2nd covariate presence count
OutMtx [32, ] = slope_XY_stratified; # Stratified regression slope (2nd vs. 1st covariate)
OutMtx [33, ] = stdev_slope_XY_stratified; # Stratified regression slope standard deviation
OutMtx [34, ] = corr_XY_stratified; # Stratified correlation = +/- sqrt(R-squared)
OutMtx [35, ] = stdev_errY_vs_X_stratified; # Stratified residual standard deviation
OutMtx [36, ] = r_sqr_X_vs_Y_stratified; # Stratified R-squared
OutMtx [37, ] = adj_r_sqr_X_vs_Y_stratified; # Stratified adjusted R-squared
OutMtx [38, ] = p_val_Y_vs_X_stratified; # Stratified P-value for hypothesis "slope = 0"
OutMtx [39, ] = colSums (Cnt_XY_per_stratum >= 2); # Number of strata with at least two counted points
OutMtx = t(OutMtx);
print ("Writing the output matrix...");
write (OutMtx, fileO, format=fmtO);
print ("END STRATIFIED STATISTICS SCRIPT");
fStat_tailprob = function (Matrix[double] fStat, Matrix[double] df_1, Matrix[double] df_2) return (Matrix[double] tailprob)
{ # TEMPORARY IMPLEMENTATION
tailprob = fStat;
for (i in 1:nrow(fStat)) {
for (j in 1:ncol(fStat)) {
q = as.scalar (fStat [i, j]);
d1 = as.scalar (df_1 [i, j]);
d2 = as.scalar (df_2 [i, j]);
if (d1 >= 1 & d2 >= 1 & q >= 0) {
tailprob [i, j] = pf(target = q, df1 = d1, df2 = d2, lower.tail=FALSE);
} else {
tailprob [i, j] = 0/0;
}
} }
}
sqrt_failsafe = function (Matrix[double] input_A) return (Matrix[double] output_A)
{
mask_A = (input_A >= 0);
prep_A = input_A * mask_A;
mask_A = mask_A * (prep_A == prep_A);
prep_A = replace (target = prep_A, pattern = NaN, replacement = 0);
output_A = sqrt (prep_A) / mask_A;
}