ML.NET已经发布了v0.2版本,新增了聚类训练器,执行性能进一步增强。本文将介绍一种特殊的回归——泊松回归,并以NBA比赛得分预测的案例来演练。
泊松回归 Poisson regression
前面的文章已提过,回归是用来预测连续值的,泊松回归是其中一种,其特殊之处在于仅用于预测非负整数,通常为计数类的数值。泊松分布是离散分布,所以特征值和标签值应为相同(或接近相同)时间间隔下的独立随机事件。
那么什么场景是符合计数,可以适用泊松回归呢?举几个例子,比如共享单车的调度,每一处地域中心,每隔1小时都要统计借车和还车数,根据这个统计我们就可以预测下一个小时此处地域需要调配多少车辆才能满足需要。再比如,公司每个月都有离职员工,那么人力资源部门就可以对月人员流失数进行计数,然后通过泊松回归来预测下个月的流失情况,以便提早采取措施做好招聘计划。
是不是有一点感觉了?本次我们用大家喜欢的NBA比赛得分来进行演练。比赛得分正好也是一种计数,也符合连续相同时间间隔(比赛时长大体相近),而且比赛结果具有不确定性,所以也是泊松回归大显身手的地方。为了易于理解,我将示范预测主场球队的得分。
NBA比赛数据
本案例数据来源于Kaggle.com,内容是NBA Team Game Stats from 2014 to 2018,这份数据集收集了最近4年的NBA比赛,格式如下:
各字段如下:
比赛基本信息:主场Team,比赛场次序号Game,比赛日期Date,主队Home,客队Opponent,主队胜负Win or Loss。
比赛主客队技术数据:Team Points,Field Goals,Field Goals Attempted,Field Goals Percentage,3 Point Shots,3 Point Shots Attempted,3 Point Shots Percentage,Free Throws,Free Throws Attempted,Free Throws Percentage,Offensive Rebounds,Total Rebounds,Assists,Steals,Blocks,Turnovers,Total Fouls。
这些指标反映了主客队投篮出手次数、命中数、命中率,三分球的出手次数、命中数、命中率,罚球的出手次数、命中数、命中率,助攻,抢断,犯规等,这些都是我们在看NBA时常见的统计。
由于只有这一份数据,为了分别用于训练、评估和预测,我将数据集按7:2:1的比例进行分割。
代码片段分解
定义原始数据结构、预测数据结构,TeamPoints是主队得分,是本次示例要预测的目标,因此定义为标签字段。
加载数据部分
const string DATA_PATH = "data/nba.games.stats.csv";

/// <summary>
/// Reads the CSV file at <see cref="DATA_PATH"/> and maps every data row to a <see cref="Match"/>.
/// </summary>
/// <remarks>
/// The first line is treated as a header and skipped; blank lines are ignored.
/// Rows are split on every comma and surrounding double quotes are stripped from each field,
/// so quoted fields that themselves contain commas are not supported.
/// Numeric columns are parsed with the invariant culture so that values such as "0.45"
/// are read identically regardless of the machine's regional settings.
/// </remarks>
/// <returns>All parsed matches, in file order.</returns>
/// <exception cref="FormatException">
/// A row has fewer than 41 columns, or a numeric column does not contain a valid number.
/// </exception>
static ICollection<Match> LoadData()
{
    // Columns 0..40 are read below.
    const int ExpectedColumns = 41;

    var matches = new List<Match>();
    using (var reader = new StreamReader(File.OpenRead(DATA_PATH)))
    {
        // Skip the header row.
        reader.ReadLine();
        var lineNumber = 1;

        string line;
        while ((line = reader.ReadLine()) != null)
        {
            lineNumber++;

            // Tolerate empty lines (e.g. a trailing newline at the end of the file).
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            var values = line.Split(',');
            if (values.Length < ExpectedColumns)
            {
                throw new FormatException(
                    "Line " + lineNumber + " of '" + DATA_PATH + "' has " + values.Length +
                    " columns; expected at least " + ExpectedColumns + ".");
            }

            // Field accessors: strip the surrounding quotes, and for numbers parse
            // culture-independently because the file uses '.' as the decimal separator.
            string Text(int index) => values[index].Trim('"');
            float Number(int index) =>
                Convert.ToSingle(values[index].Trim('"'), System.Globalization.CultureInfo.InvariantCulture);

            matches.Add(new Match
            {
                Id = Text(0),
                Team = Text(1),
                Game = Text(2),
                Date = Text(3),
                Home = Text(4),
                Opponent = Text(5),
                WINorLOSS = Text(6),
                TeamPoints = Number(7),
                OpponentPoints = Number(8),
                FieldGoals = Number(9),
                FieldGoalsAttempted = Number(10),
                FieldGoals_ = Number(11),
                X3PointShots = Number(12),
                X3PointShotsAttempted = Number(13),
                X3PointShots_ = Number(14),
                FreeThrows = Number(15),
                FreeThrowsAttempted = Number(16),
                FreeThrows_ = Number(17),
                OffRebounds = Number(18),
                TotalRebounds = Number(19),
                Assists = Number(20),
                Steals = Number(21),
                Blocks = Number(22),
                Turnovers = Number(23),
                TotalFouls = Number(24),
                Opp_FieldGoals = Number(25),
                Opp_FieldGoalsAttempted = Number(26),
                Opp_FieldGoals_ = Number(27),
                Opp_3PointShots = Number(28),
                Opp_3PointShotsAttempted = Number(29),
                Opp_3PointShots_ = Number(30),
                Opp_FreeThrows = Number(31),
                Opp_FreeThrowsAttempted = Number(32),
                Opp_FreeThrows_ = Number(33),
                Opp_OffRebounds = Number(34),
                Opp_TotalRebounds = Number(35),
                Opp_Assists = Number(36),
                Opp_Steals = Number(37),
                Opp_Blocks = Number(38),
                Opp_Turnovers = Number(39),
                Opp_TotalFouls = Number(40)
            });
        }
    }

    return matches;
}
训练、评估、预测部分
/// <summary>
/// Builds a learning pipeline over <paramref name="trainData"/> and trains a Poisson regression model.
/// </summary>
/// <remarks>
/// The pipeline drops the "Id" column, one-hot encodes the six text columns, concatenates
/// all remaining inputs into a single "Features" column, and fits a <c>PoissonRegressor</c>.
/// NOTE(review): the label column is not configured here; presumably it is declared on
/// <c>Match</c> (not visible in this snippet) — confirm.
/// </remarks>
/// <param name="trainData">The matches to train on.</param>
/// <returns>The trained prediction model.</returns>
static PredictionModel<Match, MatchPrediction> Train(IEnumerable<Match> trainData)
{
    // Text columns that must be turned into one-hot vectors before they can be used as features.
    string[] categoricalColumns = { "Team", "Game", "Date", "Home", "Opponent", "WINorLOSS" };

    // Every column that is merged into the "Features" vector, in order.
    string[] featureColumns =
    {
        "Team", "Game", "Date", "Home", "Opponent", "WINorLOSS",
        "OpponentPoints",
        "FieldGoals", "FieldGoalsAttempted", "FieldGoals_",
        "X3PointShots", "X3PointShotsAttempted", "X3PointShots_",
        "FreeThrows", "FreeThrowsAttempted", "FreeThrows_",
        "OffRebounds", "TotalRebounds", "Assists", "Steals", "Blocks", "Turnovers", "TotalFouls",
        "Opp_FieldGoals", "Opp_FieldGoalsAttempted", "Opp_FieldGoals_",
        "Opp_3PointShots", "Opp_3PointShotsAttempted", "Opp_3PointShots_",
        "Opp_FreeThrows", "Opp_FreeThrowsAttempted", "Opp_FreeThrows_",
        "Opp_OffRebounds", "Opp_TotalRebounds", "Opp_Assists", "Opp_Steals", "Opp_Blocks",
        "Opp_Turnovers", "Opp_TotalFouls"
    };

    var pipeline = new LearningPipeline
    {
        CollectionDataSource.Create(trainData),
        new ColumnDropper { Column = new[] { "Id" } },
        new CategoricalOneHotVectorizer(categoricalColumns),
        new ColumnConcatenator("Features", featureColumns),
        new PoissonRegressor()
    };

    return pipeline.Train<Match, MatchPrediction>();
}
/// <summary>
/// Evaluates <paramref name="model"/> against <paramref name="evaluateData"/> with a
/// <c>RegressionEvaluator</c> and writes the LossFn, RSquared and Rms metrics to the console.
/// </summary>
/// <param name="model">The trained model to evaluate.</param>
/// <param name="evaluateData">The matches used as the evaluation set.</param>
static void Evaluate(PredictionModel<Match, MatchPrediction> model, IEnumerable<Match> evaluateData)
{
    var regressionEvaluator = new RegressionEvaluator();
    var testSet = CollectionDataSource.Create(evaluateData);
    var metrics = regressionEvaluator.Evaluate(model, testSet);

    Console.WriteLine("LossFn: {0}", metrics.LossFn);
    Console.WriteLine("RSquared: {0}", metrics.RSquared);
    Console.WriteLine("Rms: {0}", metrics.Rms);
}
/// <summary>
/// Runs <paramref name="model"/> over <paramref name="predictData"/> and prints, for each match,
/// its date, teams, actual score and the predicted <c>TeamPoints</c> value.
/// </summary>
/// <param name="model">The trained model used for prediction.</param>
/// <param name="predictData">The matches to predict; enumerated once for prediction and once for printing.</param>
static void Predict(PredictionModel<Match, MatchPrediction> model, IEnumerable<Match> predictData)
{
    var predictions = model.Predict(predictData);

    // Pair every input match with its prediction so both can be printed on one line.
    foreach (var (match, prediction) in predictData.Zip(predictions, (m, p) => (m, p)))
    {
        Console.WriteLine(
            "Date: {0}, Team: {1} Opponent: {2}, Score: {3}-{4}, Predict Home Score: {5}",
            match.Date,
            match.Team,
            match.Opponent,
            match.TeamPoints,
            match.OpponentPoints,
            prediction.TeamPoints);
    }
}
最后是Main调用部分
执行结果
结尾
可以看到,最近的NBA比赛主队预测得分与真实结果对比,准确度已相当可观了。由于特征值都是比赛技术数据,用于以后的比赛时,可根据比赛进行的实时情况不断更新,预测便可越来越接近结果。
对球迷来说这可是一件神器呀。想想2018世界杯也马上要开始了,保罗、阿喀琉斯什么的都弱爆了,相信小伙伴们也要尝试一下ML.NET的套路了吧,记得拿到历年完整的数据哟!
相关文章:
原文地址: https://www.cnblogs.com/BeanHsiang/p/9163591.html