Commit 812e502e authored by Federico Mestrone
Browse files

Minor improvements to Spark process and code

parent b3465391
@@ -9,15 +9,18 @@ object AdtechClean extends App {
conf.setAppName("Word Count")
val sc = new SparkContext(conf)
val impressions = sc.textFile("gs://abucket-for-codemotion/adtech/test")
// Si può usare GS come file system distribuito nativo al posto di HDFS
val impressions = sc.textFile("gs://abucket-for-codemotion/adtech/test.csv")
val csv = impressions.map(line => line.split(","))
val cleaned = csv.map(rec => rec.take(2) ++ rec.drop(3).take(12))
// val cleaned = csv.map(rec => rec.take(2) ++ rec.drop(3).take(12))
val cleaned = csv.map(rec => rec.take(2) ++ rec.slice(3, 15))
val textfile = cleaned.map(rec => rec.mkString(","))
textfile.saveAsTextFile("gs://abucket-for-codemotion/adtech/test_cleaned")
// textfile.coalesce(1, shuffle = true).saveAsTextFile("gs://abucket-for-codemotion/adtech/test_cleaned")
// Si può usare GS come file system distribuito nativo al posto di HDFS
textfile.saveAsTextFile("gs://abucket-for-codemotion/adtech/test_cleaned.csv")
// textfile.coalesce(1, shuffle = true).saveAsTextFile("gs://abucket-for-codemotion/adtech/test_cleaned.csv")
}
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment