Using Parquet
Import Dependencies
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import com.databricks.spark.csv._
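These examples rely on the external spark-csv package alongside Spark 1.x. If you build with sbt, a dependency block roughly like the following is needed (version numbers are illustrative; adjust to your Spark and Scala release):
// build.sbt (illustrative versions)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "1.6.3",
  "org.apache.spark" %% "spark-sql"  % "1.6.3",
  "com.databricks"   %% "spark-csv"  % "1.5.0"
)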
Creating a DataFrame from CSV
val conf = new SparkConf().setAppName("DataFrameToParquet").setMaster("local[2]")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
// Reading the CSV file StudentData.csv
val studentsDF = sqlContext.csvFile(filePath = "StudentData.csv", useHeader = true, delimiter = '|')
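Before writing out Parquet, it can help to confirm that the CSV loaded as expected. A quick, optional sanity check using the standard DataFrame API:
// Optional sanity check: inspect the inferred schema and a few rows
studentsDF.printSchema()
studentsDF.show(5)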
Writing the DataFrame to Parquet
// Writing DataFrame to Parquet
studentsDF.write.parquet("StudentDataPQ.parquet")
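Note that the write above fails if the target directory already exists. A save mode can be specified instead; a minimal sketch, assuming overwriting the existing output is acceptable:
import org.apache.spark.sql.SaveMode

// Overwrite any existing output at the target path
studentsDF.write.mode(SaveMode.Overwrite).parquet("StudentDataPQ.parquet")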
Reading Parquet into a DataFrame
val pqDFrame = sqlContext.read.parquet("StudentDataPQ.parquet")
pqDFrame.registerTempTable("StudentDataTable")
val results = sqlContext.sql("select phone from StudentDataTable")
results.map(t => "Phone: " + t(0)).collect().foreach(println)
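The same projection can also be expressed directly with the DataFrame API instead of registering a temp table and running SQL; a minimal sketch, assuming the column is named phone:
// Equivalent projection without a temp table
pqDFrame.select("phone")
  .collect()
  .foreach(row => println("Phone: " + row(0)))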
Output
Phone: 1-754-775-9024
Phone: 1-527-990-8606
Phone: 1-155-575-9346
Phone: 1-712-794-8145
Phone: 1-106-653-2899
Phone: 1-352-299-2056
Phone: 1-461-925-7084
Phone: 1-786-210-7819
Phone: 1-312-680-5112
...