-
Notifications
You must be signed in to change notification settings - Fork 0
/
pfxMod4.R
executable file
·171 lines (137 loc) · 6.3 KB
/
pfxMod4.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# Mod4: ball / strike / in-play (B/S/X) outcome vs. pitch number.
# Question: how does the outcome mix change as a batter sees more pitches?
#
# NOTE(review): the script used to open with `head(pitchData)` followed by
# `rm(list=ls())`. `pitchData` is never created in this file, so that call
# fails in a fresh session (and halts a non-interactive run), and wiping the
# caller's workspace is a side effect nothing below depends on. Both were
# dropped; every object used later is loaded or built explicitly.
library(sqldf)
library(ggplot2)

# Inputs written by earlier scripts. The descriptions are the original
# author's notes; which objects each file holds is inferred from the rm()
# calls that follow each load().
load('pfxData_atbatMod.Rdat') # 5000 row and all rows of modified atbat data
load('pfxData_p09.Rdat')      # Full monty '09 data, .m, .s, .z

# Keep only p09.m, under a name without a '.', because sqldf has problems
# with a '.' in a data frame name.
p09 <- p09.m
rm(p09.s, p09.z, atbat.m, p09.m)

# From pfxMod1.R (SwingingStrikes ~ TotalPitches); the objects removed here
# are not used by this script.
load('pfxMod1.Rdat')
rm(p, innPitchedDf, mdf)

# From pfxMod3.R (ERA, starting/relief, etc.); eraDf is kept for the
# starter/relief analysis further down.
load('pfxMod3.Rdat')
rm(erh, erl, up, upp, tdf, df, freqTable, innPitchedTable, m1)
# Contingency table of pitch outcome (type) by pitch number (pitchNum),
# counted over every row of p09.
# NOTE: the name `t` shadows base::t as a variable only; the call t(t.p)
# below still resolves to the transpose function because R skips
# non-function bindings when looking up a function.
t <- xtabs( ~ type + pitchNum, data=p09)
head(t)
# Normalise within each column (margin 2 = pitchNum): every column then gives
# the proportion of pitches of each type on the nth pitch to the batter.
t.p <- prop.table(t, 2)
# Transpose so pitchNum becomes the first dimension before flattening.
t.p <- t(t.p)
tpDf <- as.data.frame(t.p)
# as.data.frame() on a table returns pitchNum as a factor. as.integer() on a
# factor yields the internal level codes (1, 2, 3, ...), which equal the real
# pitch numbers only when the observed values are exactly 1..k with no gaps.
# Converting through character recovers the actual pitch numbers.
tpDf$pitchNum <- as.integer(as.character(tpDf$pitchNum))
head(tpDf)
# df <- data.frame(B1=xt[,'Single'],B2=xt[, 'Double'])
# MIKE, what's a good way to get from a table to a df?
# df <- data.frame(PitchNum=1:nrow(t.p), Ball=t.p[, 'B'], Strike=t.p[,'S'], Hit=t.p[,'X'])
# df
# p <- ggplot(data=df, aes(x=PitchNum))
# p <- p + geom_point(aes(y=Ball), colour='blue')
# p <- p + geom_point(aes(y=Strike), colour='red')
# p <- p + geom_point(aes(y=Hit), colour='black')
# p
#
# Scatter of the outcome proportions P(B, S, X | pitchNum) held in tpDf:
# pitch number on x, proportion on y, one facet column per outcome type.
p1 <- ggplot(data = tpDf, mapping = aes(x = pitchNum, y = Freq))
p1 <- p1 + geom_point()
p1 <- p1 + facet_grid(. ~ type)
p1
# Straight-line fit of proportion against pitch number, one model per
# outcome type. Rows with pitchNum >= 15 are excluded from every fit because
# there are too few samples out there.

# Balls
ball.lm <- lm(
  formula = Freq ~ pitchNum,
  data = subset(tpDf, type == "B" & pitchNum < 15)
)

# Strikes
strike.lm <- lm(
  formula = Freq ~ pitchNum,
  data = subset(tpDf, type == "S" & pitchNum < 15)
)

# Balls in play
hit.lm <- lm(
  formula = Freq ~ pitchNum,
  data = subset(tpDf, type == "X" & pitchNum < 15)
)

# Coefficient tables, in the order ball, strike, hit.
summary(ball.lm)
summary(strike.lm)
summary(hit.lm)
###
# Attach each model's fitted line to the rows of its own outcome type, then
# stack the three pieces back into one data frame for plotting.
#
# withFit(): return `rows` with an extra `fit` column holding `model`'s
# prediction for each row. Predictions are made for every pitch number, so
# for pitchNum >= 15 (excluded from the fits) the line is an extrapolation.
withFit <- function(rows, model) {
  rows$fit <- predict(model, newdata = rows)
  rows
}
df1 <- withFit(subset(tpDf, type == 'B'), ball.lm)
df2 <- withFit(subset(tpDf, type == 'S'), strike.lm)
df3 <- withFit(subset(tpDf, type == 'X'), hit.lm)
df <- rbind(df1, df2, df3)
#
# P(B, S, X | pitchNum) as points, with each type's linear fit drawn in red.
# The stray unnamed `fit` argument that used to sit in aes() was removed: it
# mapped no aesthetic; the fit is mapped explicitly in geom_line() below.
plot1 <- ggplot(data=df, aes(x=pitchNum, y=Freq)) + facet_grid(. ~ type) + geom_point()
plot1 <- plot1 + geom_line(aes(y=fit), color='red')
plot1
#
# ---- Add starter / relief to the analysis ----
#
# The startPitcher flag lives in eraDf (from pfxMod3.R). Take just the
# columns needed from the pitch data and join the flag on; merge() matches
# on the columns the two inputs share.
p1 <- p09[, c("pitcher", "type", "pitchNum"), drop = FALSE]
p1 <- merge(p1, eraDf[, c("pitcher", "startPitcher"), drop = FALSE])
# Count pitches for every (startPitcher, type, pitchNum) combination.
# sqldf is used here in place of the xtabs() call used for the pooled table.
t1 <- sqldf(
  'select startPitcher, type, pitchNum, count() numPitches from p1 group by startPitcher, type, pitchNum'
)
t1
# Goal: P(type | pitchNum), computed separately for starters and relievers.
# Dividing numPitches by the grand total, or by the total within each role,
# gives a joint proportion over (type, pitchNum) instead, so the counts are
# cross-tabulated and each pitchNum row is normalised within a role.
head(t1)
xt <- xtabs(numPitches ~ pitchNum + type + startPitcher, data=t1)
# The role slices below are taken by position, which is only meaningful when
# startPitcher has exactly two levels; fail loudly otherwise.
stopifnot(dim(xt)[3] == 2L)
# NOTE(review): slice 1 is assumed to be the relief level and slice 2 the
# starter level (FALSE/0 sorting before TRUE/1) -- confirm with
# dimnames(xt)$startPitcher.
ptRelief <- prop.table(xt[,,1], 1) # row-wise proportions, relief pitchers
ptStart <- prop.table(xt[,,2], 1) # row-wise proportions, starting pitchers
# Flatten both tables to long data frames (pitchNum, type, Freq), label the
# role, and stack them.
dfStart <- as.data.frame(as.table(ptStart))
dfStart$startPitcher <- 'Start'
dfRelief <- as.data.frame(as.table(ptRelief))
dfRelief$startPitcher <- 'Relief'
dfAll <- rbind(dfStart, dfRelief)
# pitchNum comes back as a factor; as.integer() alone would return level
# codes rather than the pitch numbers, so convert through character.
dfAll$pitchNum <- as.integer(as.character(dfAll$pitchNum))
dfAll
# P(B, S, X | pitchNum) split by role: points only, with pitcher role on the
# facet rows and outcome type on the facet columns.
plot2 <- ggplot(data = dfAll, mapping = aes(x = pitchNum, y = Freq))
plot2 <- plot2 + geom_point()
plot2 <- plot2 + facet_grid(startPitcher ~ type)
plot2
# Linear fit of proportion against pitch number for every combination of
# outcome type (B, S, X) and role (Start, Relief).
# Don't include cases where pitchNum >= 15 - too few samples
ballS.lm <- lm(Freq ~ pitchNum, data=subset(dfAll, type=='B' & pitchNum < 15 & startPitcher=='Start'))
strikeS.lm <- lm(Freq ~ pitchNum, data=subset(dfAll, type=='S' & pitchNum < 15 & startPitcher=='Start'))
hitS.lm <- lm(Freq ~ pitchNum, data=subset(dfAll, type=='X' & pitchNum < 15 & startPitcher=='Start'))
ballR.lm <- lm(Freq ~ pitchNum, data=subset(dfAll, type=='B' & pitchNum < 15 & startPitcher=='Relief'))
strikeR.lm <- lm(Freq ~ pitchNum, data=subset(dfAll, type=='S' & pitchNum < 15 & startPitcher=='Relief'))
hitR.lm <- lm(Freq ~ pitchNum, data=subset(dfAll, type=='X' & pitchNum < 15 & startPitcher=='Relief'))
# Summarise the six models fitted above. The earlier version printed
# ball.lm / strike.lm / hit.lm here, i.e. the pooled models from the previous
# section, so the per-role fits were never actually inspected.
summary(ballS.lm)
summary(strikeS.lm)
summary(hitS.lm)
summary(ballR.lm)
summary(strikeR.lm)
summary(hitR.lm)
###
# Attach each per-role model's fitted line to the matching rows of dfAll and
# stack the six pieces, in the order B/Start, B/Relief, S/Start, S/Relief,
# X/Start, X/Relief.
#
# withFit(): return `rows` with an extra `fit` column holding `model`'s
# prediction for each row. Predictions cover every pitch number, so for
# pitchNum >= 15 (excluded from the fits) the line is an extrapolation.
withFit <- function(rows, model) {
  rows$fit <- predict(model, newdata = rows)
  rows
}
df1 <- withFit(subset(dfAll, type == 'B' & startPitcher == 'Start'), ballS.lm)
df2 <- withFit(subset(dfAll, type == 'B' & startPitcher == 'Relief'), ballR.lm)
df3 <- withFit(subset(dfAll, type == 'S' & startPitcher == 'Start'), strikeS.lm)
df4 <- withFit(subset(dfAll, type == 'S' & startPitcher == 'Relief'), strikeR.lm)
df5 <- withFit(subset(dfAll, type == 'X' & startPitcher == 'Start'), hitS.lm)
df6 <- withFit(subset(dfAll, type == 'X' & startPitcher == 'Relief'), hitR.lm)
df <- rbind(df1, df2, df3, df4, df5, df6)
#
# P(B, S, X | pitchNum) by role as points, with each panel's linear fit in
# red. The stray unnamed `fit` argument that used to sit in aes() was
# removed: it mapped no aesthetic; the fit is mapped in geom_line() below.
plot3 <- ggplot(data=df, aes(x=pitchNum, y=Freq)) + facet_grid(startPitcher ~ type) + geom_point()
plot3 <- plot3 + geom_line(aes(y=fit), color='red')
plot3
# Write the per-role proportions with their fitted values (under a
# descriptive name) together with eraDf to pfxMod4.Rdat.
dfPitchFreqTypeRelief <- df
save(list = c("dfPitchFreqTypeRelief", "eraDf"), file = "pfxMod4.Rdat")