Linear Regression
Note: Make sure your training and test data are already vectorized and ready to go before you try to fit the machine learning model; fitting it on unprepared data will not work.
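For example, here is a minimal prep sketch, assuming a raw DataFrame named raw_df with numeric feature columns col1, col2, col3 and a label column called label (all placeholder names for your own data):
from pyspark.ml.feature import VectorAssembler
# Assemble the numeric input columns into the single "features" vector column expected below
assembler = VectorAssembler(inputCols=["col1", "col2", "col3"], outputCol="features")
prepped = assembler.transform(raw_df).select("features", "label")
# Split into the train/test DataFrames used by the model below
train, test = prepped.randomSplit([0.8, 0.2], seed=42)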
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
lr = LinearRegression(labelCol="label", featuresCol="features")
lrparamGrid = (ParamGridBuilder()
               .addGrid(lr.regParam, [0.001, 0.01, 0.1, 0.5, 1.0, 2.0])
               # .addGrid(lr.regParam, [0.01, 0.1, 0.5])
               .addGrid(lr.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])
               # .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
               .addGrid(lr.maxIter, [1, 5, 10, 20, 50])
               # .addGrid(lr.maxIter, [1, 5, 10])
               .build())
lrevaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="rmse")
# Create 5-fold CrossValidator
lrcv = CrossValidator(estimator=lr,
                      estimatorParamMaps=lrparamGrid,
                      evaluator=lrevaluator,
                      numFolds=5)
lrcvModel = lrcv.fit(train)
print(lrcvModel)
lrcvSummary = lrcvModel.bestModel.summary
print("Coefficient Standard Errors: " + str(lrcvSummary.coefficientStandardErrors))
print("P Values: " + str(lrcvSummary.pValues)) # Last element is the intercept
lrpredictions = lrcvModel.transform(test)
print('RMSE:', lrevaluator.evaluate(lrpredictions))
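To see which hyperparameter combination the cross-validation actually selected, you can line up the parameter maps with their average metrics. A sketch: since the metric here is RMSE, the smallest average value wins.
# Pair each parameter combination with its average cross-validated RMSE
paramMaps = lrcvModel.getEstimatorParamMaps()
bestParams, bestRmse = min(zip(paramMaps, lrcvModel.avgMetrics), key=lambda pair: pair[1])
print("Best average RMSE:", bestRmse)
print("Best params:", {p.name: v for p, v in bestParams.items()})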
Note: When you use the CrossValidator function to set up cross-validation of your models, the resulting model object keeps the results of all the runs (the average metric for each parameter combination), but it only uses the best model when you interact with it through functions like evaluate or transform.
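For example (a sketch; the save path is a placeholder), the following two calls produce the same predictions, and you can persist just the winning model if that is all you need later:
# transform on the CrossValidatorModel delegates to the best model
predsFromCv = lrcvModel.transform(test)
predsFromBest = lrcvModel.bestModel.transform(test)
# Persist only the best model for later use (path is a placeholder)
lrcvModel.bestModel.write().overwrite().save("/tmp/best_lr_model")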