Linear Regression
Setting Up Linear Regression
Load in required libraries
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
Initialize Linear Regression object
# Estimator to be tuned: reads its inputs from the "features" column and its
# target from the "label" column.
lr = LinearRegression(featuresCol="features", labelCol="label")
Create a parameter grid for tuning the model
# Hyperparameter search space: three value lists for regParam, elasticNetParam
# and maxIter (6 * 5 * 5 = 150 combinations when fully crossed).
# A reduced alternative grid was previously kept here commented out:
#   regParam [0.01, 0.1, 0.5], elasticNetParam [0.0, 0.5, 1.0], maxIter [1, 5, 10]
lrparamGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.001, 0.01, 0.1, 0.5, 1.0, 2.0])
    .addGrid(lr.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])
    .addGrid(lr.maxIter, [1, 5, 10, 20, 50])
    .build()
)
Define how you want the model to be evaluated
# Scores predictions by comparing the "prediction" column against the "label"
# column using the "rmse" metric.
lrevaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction",
)
Define the type of cross-validation you want to perform
# Combine the estimator, its parameter grid and the evaluator into a
# cross-validator that uses 5 folds.
lrcv = CrossValidator(
    numFolds=5,
    estimator=lr,
    estimatorParamMaps=lrparamGrid,
    evaluator=lrevaluator,
)
Fit the model to the data
# Run the cross-validated grid search on `train` (defined outside this
# snippet) and keep the resulting fitted model.
lrcvModel = lrcv.fit(train)

# Show the fitted cross-validator model's string representation.
print(lrcvModel)
Get model information
# Summary object attached to the best model found by the cross-validator.
lrcvSummary = lrcvModel.bestModel.summary

# NOTE(review): Spark documents coefficientStandardErrors and pValues as
# available only with the "normal" solver; confirm the selected best model
# (which may use regularization) actually exposes them before relying on this.
print(f"Coefficient Standard Errors: {lrcvSummary.coefficientStandardErrors!s}")
# Last element is the intercept
print(f"P Values: {lrcvSummary.pValues!s}")
Score the testing dataset using your fitted model for evaluation purposes
# Apply the fitted cross-validator model to `test` (defined outside this
# snippet) to obtain the predictions that are evaluated afterwards.
lrpredictions = lrcvModel.transform(test)
Evaluate the model
# Report the evaluator's "rmse" metric computed over the predictions.
print("RMSE:", lrevaluator.evaluate(lrpredictions))
Last updated
Was this helpful?