Commit 86259ddf authored by Federico Mestrone
Browse files

Easier to change bucket and file info in spark job

parent 812e502e
gcp.bucket=gcp-bigdata-bucket
spark.in-file=adtech/test.csv
spark.out-file=adtech/test_cleaned.csv
package gcp.cm.bigdata.adtech.spark
import java.util.Properties
import org.apache.spark.{SparkConf, SparkContext}
// Spark job entry point. Reads a CSV text file from a Google Cloud Storage
// bucket, splits each line on commas, and writes comma-joined records back to
// the bucket. The bucket name and the input/output paths come from the
// /gcp.properties classpath resource (keys gcp.bucket, spark.in-file,
// spark.out-file).
// NOTE(review): this listing is a diff rendering; part of the body (including
// the definition of `cleaned`) is elided at the hunk marker below.
object AdtechClean extends App {
val props = new Properties
// props.load(AdtechClean.getClass.getResourceAsStream("/gcp.properties"))
// Load the job settings from the classpath.
// NOTE(review): the stream is never closed here, and getResourceAsStream
// returns null when the resource is missing -- confirm this is acceptable.
props.load(getClass.getResourceAsStream("/gcp.properties"))
val bucketName = props.getProperty("gcp.bucket")
val inFile = props.getProperty("spark.in-file")
val outFile = props.getProperty("spark.out-file")
val conf = new SparkConf()
// The master is hard-coded to "local".
conf.setMaster("local")
// NOTE(review): the app name "Word Count" does not describe this job --
// presumably left over from another example; confirm before changing.
conf.setAppName("Word Count")
val sc = new SparkContext(conf)
// GCS can be used as a native distributed file system in place of HDFS
// NOTE(review): the next two lines both define `impressions`; the first uses
// the hard-coded path and appears to be the removed side of the diff.
val impressions = sc.textFile("gs://abucket-for-codemotion/adtech/test.csv")
val impressions = sc.textFile(s"gs://$bucketName/$inFile")
// Split every input line into its comma-separated fields.
val csv = impressions.map(line => line.split(","))
......@@ -20,7 +30,7 @@ object AdtechClean extends App {
// Join each record's fields back into a single comma-separated line.
val textfile = cleaned.map(rec => rec.mkString(","))
// GCS can be used as a native distributed file system in place of HDFS
// NOTE(review): the hard-coded save below appears to be the removed side of
// the diff; the property-driven save further down is its replacement.
textfile.saveAsTextFile("gs://abucket-for-codemotion/adtech/test_cleaned.csv")
// textfile.coalesce(1, shuffle = true).saveAsTextFile("gs://abucket-for-codemotion/adtech/test_cleaned.csv")
textfile.saveAsTextFile(s"gs://$bucketName/$outFile")
// Disabled alternative: coalesce to one partition (with shuffle) before saving.
// textfile.coalesce(1, shuffle = true).saveAsTextFile(s"gs://$bucketName/$outFile")
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment