-
Notifications
You must be signed in to change notification settings - Fork 0
/
8-reporting.R
85 lines (66 loc) · 2.35 KB
/
8-reporting.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# Reporteo
# Código literado
# Switch to the project directory only when it exists on this machine, so the
# script does not abort with "cannot change working directory" elsewhere.
# The caller's workspace is deliberately not cleared: every object used below
# is (re)created by this script.
projectDir <- "/home/soyyo/projects/Modelos de datos/datosR"
if (dir.exists(projectDir)) {
  setwd(projectDir)
}
library(kernlab)
library(boot)
## Load the SPAM dataset
data(spam)
str(spam[, 1:5])

## Subsample the rows into a training set and a test set
set.seed(3435)  # fixed seed so the random split is reproducible
# One Bernoulli(0.5) draw per row of `spam`: 1 = training row, 0 = test row.
# Using nrow(spam) keeps the indicator aligned with the data instead of
# relying on a hard-coded row count.
trainIndicator <- rbinom(nrow(spam), size = 1, prob = 0.5)
str(trainIndicator)
table(trainIndicator)

# Store the indicator as an extra column of the data frame.
spam <- cbind(spam, trainIndicator)
str(spam)
# Show only the first few values; printing the full column floods the report.
head(spam$trainIndicator)

# Logical subsetting is enough here: the indicator contains no NA values.
trainSpam <- spam[trainIndicator == 1, ]
testSpam <- spam[trainIndicator == 0, ]
## Inspect the training set
head(names(trainSpam), n = 20L)
head(trainSpam[, 1:10], n = 6L)
table(trainSpam[["type"]])

# capitalAve by message type, on the raw scale and after log10(x + 1).
boxplot(capitalAve ~ type, data = trainSpam)
boxplot(log10(capitalAve + 1) ~ type, data = trainSpam)

# Pairwise scatterplots of the first four columns after log10(x + 1).
pairs(log10(trainSpam[, 1:4] + 1))

# Hierarchical clustering of the columns: the data are transposed so that
# dist() compares columns rather than rows.
hCluster <- hclust(dist(t(trainSpam[, 1:57])))
plot(hCluster)

# Same clustering on log10(x + 1) values (columns 1:55 only).
hClusterUpdated <- hclust(dist(t(log10(trainSpam[, 1:55] + 1))))
plot(hClusterUpdated)
## Inspect the test set
head(names(testSpam), n = 20L)
head(testSpam[, 1:10], n = 6L)
table(testSpam[["type"]])

# capitalAve by message type, on the raw scale and after log10(x + 1).
boxplot(capitalAve ~ type, data = testSpam)
boxplot(log10(capitalAve + 1) ~ type, data = testSpam)

# Pairwise scatterplots of the first four columns after log10(x + 1).
pairs(log10(testSpam[, 1:4] + 1))

# Hierarchical clustering of the columns: the data are transposed so that
# dist() compares columns rather than rows.
hCluster <- hclust(dist(t(testSpam[, 1:57])))
plot(hCluster)

# Same clustering on log10(x + 1) values (columns 1:55 only).
hClusterUpdated <- hclust(dist(t(log10(testSpam[, 1:55] + 1))))
plot(hClusterUpdated)
## Train the SPAM prediction model
str(trainSpam$type)
# Numeric outcome for glm(): the factor's integer codes shifted to start at 0
# (i.e. 0/1 when `type` has two levels).
trainSpam$numType <- as.numeric(trainSpam$type) - 1

# Cost handed to cv.glm(): the number of observations whose observed outcome
# `x` differs from the prediction `y` thresholded at 0.5.
costFunction <- function(x, y) sum(x != (y > 0.5))

# Fit one single-predictor logistic regression per candidate column and record
# its 2-fold cross-validation error. Only the first `nCandidates` columns are
# tried, as in the original analysis.
nCandidates <- 55
cvError <- rep(NA_real_, nCandidates)
for (i in seq_len(nCandidates)) {
  lmFormula <- reformulate(names(trainSpam)[i], response = "numType")
  glmFit <- glm(lmFormula, family = "binomial", data = trainSpam)
  # delta[2] is the second (adjusted) cross-validation estimate from cv.glm().
  cvError[i] <- cv.glm(trainSpam, glmFit, cost = costFunction, K = 2)$delta[2]
}
warnings()

## Which predictor has the smallest cross-validation error?
names(trainSpam)[which.min(cvError)]
## Use the best model from the group.
predictionModel <- glm(
  numType ~ charDollar,
  family = "binomial",
  data = trainSpam
)

## Get predictions on the test set.
# type = "response" returns probabilities; the default would be the link
# (log-odds) scale, for which a 0.5 cut-off is not meaningful.
predictionTest <- predict(predictionModel, newdata = testSpam, type = "response")

## Classify as "spam" the test messages with predicted probability > 0.5.
# The test-set predictions must be used here: predictionModel$fitted holds the
# fitted values for the training rows, and indexing with it would silently
# recycle a vector of the wrong length.
predictedSpam <- rep("nonspam", nrow(testSpam))
predictedSpam[predictionTest > 0.5] <- "spam"
table(predictedSpam, testSpam$type)

## Error rate
# Computed from the predictions rather than hard-coded from an earlier table,
# so it always matches the confusion table printed above.
mean(predictedSpam != as.character(testSpam$type))
# For our experiments to be reproducible, we must document the environment in which they were run.
sessionInfo()