From 58d770c160d1c0162763d68e4928ee487a33e1a8 Mon Sep 17 00:00:00 2001 From: Matthew Powers Date: Fri, 12 May 2023 14:01:22 -0400 Subject: [PATCH 1/4] Unsuccessfully suppress CRC files --- dat/spark_builder.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dat/spark_builder.py b/dat/spark_builder.py index 1c4520a3..27f5dfc4 100644 --- a/dat/spark_builder.py +++ b/dat/spark_builder.py @@ -18,4 +18,7 @@ def get_spark_session(): 'org.apache.spark.sql.delta.catalog.DeltaCatalog', ) builder = delta.configure_spark_with_delta_pip(builder) - return builder.getOrCreate() + spark = builder.enableHiveSupport().getOrCreate() + spark._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") + # spark.conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") + return spark From 74550177670451fa8c350be9f412c705f50f4364 Mon Sep 17 00:00:00 2001 From: Matthew Powers Date: Mon, 15 May 2023 15:30:06 -0400 Subject: [PATCH 2/4] Correctly suppress CRC Parquet files --- dat/spark_builder.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/dat/spark_builder.py b/dat/spark_builder.py index 27f5dfc4..ef8ab465 100644 --- a/dat/spark_builder.py +++ b/dat/spark_builder.py @@ -19,6 +19,13 @@ def get_spark_session(): ) builder = delta.configure_spark_with_delta_pip(builder) spark = builder.enableHiveSupport().getOrCreate() - spark._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") - # spark.conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") + hadoop = spark.sparkContext._jvm.org.apache.hadoop # type: ignore + hadoop_conf = spark._jsc.hadoopConfiguration() # type: ignore + fs = hadoop.fs.FileSystem.get(hadoop_conf) # type: ignore + fs.setWriteChecksum(False) return spark + + # spark = builder.enableHiveSupport().getOrCreate() + # spark._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") + # # spark.conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") + # return spark From bff81299899ba351452427209080c4cc07bdaaa8 Mon Sep 17 00:00:00 2001 From: Matthew Powers Date: Mon, 15 May 2023 15:32:32 -0400 Subject: [PATCH 3/4] Don't lint some lines --- dat/spark_builder.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dat/spark_builder.py b/dat/spark_builder.py index ef8ab465..efcc0c4c 100644 --- a/dat/spark_builder.py +++ b/dat/spark_builder.py @@ -25,7 +25,7 @@ def get_spark_session(): fs.setWriteChecksum(False) return spark - # spark = builder.enableHiveSupport().getOrCreate() - # spark._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") - # # spark.conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") - # return spark + # spark = builder.enableHiveSupport().getOrCreate() # type: ignore + # spark._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") # type: ignore + # # spark.conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") # type: ignore + # return spark # type: ignore From ec566ca7ebb422fa4e72123eca43ccac6ae88e6e Mon Sep 17 00:00:00 2001 From: Matthew Powers Date: Mon, 15 May 2023 15:50:52 -0400 Subject: [PATCH 4/4] Try to make linter happy again --- dat/spark_builder.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/dat/spark_builder.py b/dat/spark_builder.py index efcc0c4c..4eb6743f 100644 --- a/dat/spark_builder.py +++ b/dat/spark_builder.py @@ -24,8 +24,3 @@ def get_spark_session(): fs = hadoop.fs.FileSystem.get(hadoop_conf) # type: ignore fs.setWriteChecksum(False) return spark - - # spark = builder.enableHiveSupport().getOrCreate() # type: ignore - # spark._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") # type: ignore - # # spark.conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") # type: ignore - # return spark # type: ignore