A PITCHf/x primer Posted by Mike Fast under Uncategorized [6]. MLBAM used the PITCHf/x data in their Enhanced Gameday application and also made the data freely available for downloading and research. If you want to manipulate and analyze a single game’s worth of data, you can download and import the XML files into Microsoft Excel. My Profile Settings Download the App Sign Out. Xx MB of 10 GB used Scott Lindholm - Profile Favorite Download Workbook. 2018 PITCHf/x Data 349 views| Scott Lindholm. Baseball Savant PITCHf/x Data Pct of called balls in strike zone (BiZ%) plotted against called strikes outside strike zone (SoZ%) Reference lines are medians.
Download pitchfx XML data (haven't tested in years).
DownloadPitchFX.R
Pitchf X System
# DownloadPitchFX.R
# downloads the massive MLB Gameday data.
# Version 0.4
# Version History
# 0.5 ~ grab player data, both pitchers and batters, ability to pick team
# 0.4 ~ get team data, and ability to grab team info, checks to see if regular season
# 0.3 ~ updated so 2010 works, fixed some bugs, and saves as tab delimited file
# 0.2 ~ inputs are start and end dates
# 0.1 ~ grab Pitch f/x data from MLB Gameday, specify date ranges (takes half a minute for a day's worth of data on my 2.5Ghz machine)
# Future Versions:
# ~ ability to pick pitchers, batters, teams
# - ability to grab matchups
# - better searching instead of tediously parsing through each XML file
# ~ connect to mysql database
# ~ don't overheat computer!
# ~ document Gameday Code
# downloading pitch f/x data from MLB website
# Get data from http://gd2.mlb.com/components/game/mlb/
# XML package http://www.omegahat.org/RSXML/shortIntro.html
# Perl script of same application by Mike Fast:
# http://fastballs.files.wordpress.com/2007/09/hack_28_parser_mikefast_test_pl.txt
# Less general R code from Erik Iverson of Blogistic Reflections:
# http://blogisticreflections.wordpress.com/2009/10/04/using-r-to-analyze-baseball-games-in-real-time/
# listing of pitch f/x tools by Baseball Analysts
# http://baseballanalysts.com/archives/2010/03/how_can_i_get_m.php
# downloadable pitch f/x database from Darrell Zimmerman
# http://www.wantlinux.net/category/baseball-data/
# I think gameday data starts 2005
# I think enhanced gameday (pitch fx) has all of 2009, most of 2008, some 2007, tiny bit 2006
# required libraries:
library(XML)
# code for <game type> in game.xml (input game.type in code)
# 'S' ~ spring training, 'R' ~ regular season, 'D' ~ Division Series
# 'L' ~ League Championship Series 'W' ~ World Series
# code for <game gameday_sw> in game.xml
# http://sports.dir.groups.yahoo.com/group/RetroSQL/message/320
# 'N' ~ missing, no pitch info
# 'Y' ~ standard w/ pitch locations
# 'E' ~ w/ pitch f/x
# 'P' ~ for 2010, whatever that's supposed to mean
# code for teams
# code for players
# code for gameday
# code for pitch type
# code for atbat type
# checks for:
# gameday type
# home, away
# player, batter, pitch type
# -----------------------------------------------------------
# DownloadPitchFX
#   Walks the Gameday directory listing day by day, game by game and inning
#   by inning, and appends one tab-delimited row per pitch to `fileloc`.
#
# Arguments:
#   fileloc     path of the output file; it is overwritten and starts with a
#               header row (Year, Month, Day, Inning, Home, Away, then the
#               at-bat attributes, then the pitch attributes).
#   start.date  first day to download (anything as.POSIXlt() understands).
#   end.date    last day to download, inclusive; defaults to start.date.
#   URL.base    root URL of the Gameday tree (must end with '/').
#   game.type   game type code(s) to keep, matched against the `type`
#               attribute of <game> in game.xml; may be a vector.
#   grab.pitch  names of <pitch> attributes to write for every pitch.
#   grab.atbat  names of <atbat> attributes to write; repeated on every
#               pitch row belonging to that at-bat.
#
# Only games whose `gameday_sw` attribute is 'E' or 'P' are downloaded.
# Attributes a node does not carry are written as NA.
# Returns NULL invisibly; the function is called for its file side effect.
DownloadPitchFX <- function(fileloc = './pitchfx.txt',
                            start.date = '2009-05-02', end.date = start.date,
                            URL.base = 'http://gd2.mlb.com/components/game/mlb/',
                            game.type = 'R',
                            grab.pitch = c('des', 'type', 'x', 'y',
                              'start_speed', 'end_speed',
                              'sz_top', 'sz_bot', 'pfx_x', 'pfx_z', 'px', 'pz',
                              'x0', 'y0', 'z0', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az',
                              'break_y', 'break_angle', 'break_length', 'pitch_type',
                              'type_confidence'),
                            grab.atbat = c('b', 's', 'o', 'batter', 'pitcher', 'b_height',
                              'stand', 'p_throws', 'event')) {
  # Requested attributes of every node as a character matrix, one row per
  # node. Always a matrix (even for zero or one node), so the single-pitch
  # and single-at-bat cases need no special handling downstream.
  attr.matrix <- function(nodes, attr.names) {
    out <- matrix(NA_character_, nrow = length(nodes),
                  ncol = length(attr.names),
                  dimnames = list(NULL, attr.names))
    for (k in seq_along(nodes)) {
      node.attrs <- xmlAttrs(nodes[[k]])
      if (!is.null(node.attrs)) {
        out[k, ] <- node.attrs[attr.names]
      }
    }
    out
  }

  # href of every link found on an HTML directory listing page.
  list.links <- function(url) {
    as.character(unlist(xpathSApply(htmlParse(url), '//a[@*]',
                                    xmlGetAttr, 'href')))
  }

  # write initial variables on file
  meta <- c('Year', 'Month', 'Day', 'Inning', 'Home', 'Away')
  n.col <- length(meta) + length(grab.atbat) + length(grab.pitch)
  write(c(meta, grab.atbat, grab.pitch), file = fileloc,
        ncolumns = n.col, sep = '\t')

  # transfer date info; going through Date (not a difftime in seconds/days)
  # keeps the day count exact across daylight-saving changes
  first.day <- as.Date(as.POSIXlt(start.date))
  last.day <- as.Date(as.POSIXlt(end.date))
  if (is.na(first.day) || is.na(last.day) || last.day < first.day) {
    stop('start.date and end.date must be valid dates with ',
         'end.date >= start.date', call. = FALSE)
  }
  date.range <- seq(first.day, last.day, by = 'day')

  for (i in seq_along(date.range)) {
    year <- as.integer(format(date.range[i], '%Y'))
    month <- as.integer(format(date.range[i], '%m'))
    day <- as.integer(format(date.range[i], '%d'))
    URL.date <- sprintf('%syear_%d/month_%02d/day_%02d/',
                        URL.base, year, month, day)

    # grab matchups for today: every link that starts with 'gid_' is a game
    parse.day <- list.links(URL.date)
    parse.day <- parse.day[grepl('^gid_', parse.day)]

    # for each game (zero iterations when no games exist today)
    for (game in seq_along(parse.day)) {
      print(game)
      URL.game <- paste0(URL.date, parse.day[game])

      # skip if game.xml does not exist
      if (!('game.xml' %in% list.links(URL.game))) {
        next
      }

      # grab game type (regular season, etc.) and gameday type (pitch f/x, etc.)
      XML.game <- xmlInternalTreeParse(paste0(URL.game, 'game.xml'))
      game.attrs <- attr.matrix(getNodeSet(XML.game, '//game[@*]'),
                                c('type', 'gameday_sw'))
      if (nrow(game.attrs) == 0) {
        next
      }

      # if proper game type: 'R' ~ regular season, 'S' ~ spring, 'D' ~ division series
      # 'L' ~ league championship series, 'W' ~ world series
      # (%in% is NA-safe and lets game.type list several types)
      if (!(game.attrs[1, 'type'] %in% game.type)) {
        next
      }

      # grab team names; the first <team> is taken as home and the second as
      # away, as in the original script -- NOTE(review): confirm this order
      # against game.xml
      team.abbrev <- attr.matrix(getNodeSet(XML.game, '//team[@*]'),
                                 'abbrev')[, 'abbrev']
      home <- team.abbrev[1]
      away <- team.abbrev[2]

      # skip unless pitch f/x data exists
      if (!(game.attrs[1, 'gameday_sw'] %in% c('E', 'P'))) {
        next
      }

      # grab number of innings played
      inning.links <- list.links(paste0(URL.game, 'inning/'))
      n.innings <- sum(grepl('^inning_[0-9]', inning.links))

      # check to see if game data exists by checking innings > 1
      if (n.innings <= 1) {
        next
      }

      for (inning in seq_len(n.innings)) {
        # grab inning info
        URL.inning <- paste0(URL.game, 'inning/', 'inning_', inning, '.xml')
        XML.inning <- xmlInternalTreeParse(URL.inning)
        atbat.nodes <- getNodeSet(XML.inning, '//atbat[@*]')

        # check to see if atbat exists
        if (length(atbat.nodes) == 0) {
          next
        }
        print(paste(parse.day[game], 'inning =', inning))

        # number of <pitch> children per at-bat, and those same pitches in
        # document order, so at-bat rows and pitch rows always line up
        n.pitches <- vapply(atbat.nodes, function(ab) {
          sum(names(xmlChildren(ab)) == 'pitch')
        }, integer(1))
        pitch.nodes <- getNodeSet(XML.inning, '//atbat[@*]/pitch')
        if (length(pitch.nodes) == 0) {
          next
        }

        # repeat each at-bat row once per pitch thrown in that at-bat
        results.atbat <- attr.matrix(atbat.nodes, grab.atbat)
        results.atbat <- results.atbat[rep(seq_along(atbat.nodes),
                                           times = n.pitches), ,
                                       drop = FALSE]
        parse.pitch <- attr.matrix(pitch.nodes, grab.pitch)

        # write results (write() fills by column, hence the transpose)
        write(t(cbind(year, month, day, inning, home, away,
                      results.atbat, parse.pitch)), file = fileloc,
              ncolumns = n.col, append = TRUE, sep = '\t')
      }
    }
  }
  invisible(NULL)
}
Pitch X Data Download Software
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment