python/pyspark/logger/worker_io.py (9 changes: 6 additions & 3 deletions)
@@ -223,7 +223,11 @@ def context_provider() -> dict[str, str]:
- class_name: Name of the class that initiated the logging if available
"""

-def is_pyspark_module(module_name: str) -> bool:
+def is_pyspark_module(frame: FrameType) -> bool:
+    module_name = frame.f_globals.get("__name__", "")
+    if module_name == "__main__":
+        if (mod := sys.modules.get("__main__", None)) and mod.__spec__:
+            module_name = mod.__spec__.name
Comment on lines +226 to +230
Contributor:
Just so I understand: were these logging changes needed/discovered after adding the test, or are they just a cleanup?

Contributor Author:

This is needed for the test. The original code did not consider the simple worker case, where the worker module is executed with `python -m XXX`: there `__name__` is `"__main__"`, so the check for whether a frame belongs to a pyspark module returned the wrong result.

return module_name.startswith("pyspark.") and ".tests." not in module_name

bottom: Optional[FrameType] = None
@@ -236,9 +240,8 @@ def is_pyspark_module(module_name: str) -> bool:
if frame:
while frame.f_back:
f_back = frame.f_back
-module_name = f_back.f_globals.get("__name__", "")

-if is_pyspark_module(module_name):
+if is_pyspark_module(f_back):
if not is_in_pyspark_module:
bottom = frame
is_in_pyspark_module = True
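For readers skimming the diff: a minimal, self-contained sketch of the fixed helper (the function body is taken from the diff above; the demo `__main__` guard is hypothetical). When a module is started with `python -m`, its `__name__` is `"__main__"`, but `sys.modules["__main__"].__spec__.name` still carries the real dotted module name, which is what the new branch recovers before testing the `pyspark.` prefix:

```python
import sys
from types import FrameType


def is_pyspark_module(frame: FrameType) -> bool:
    # Resolve the frame's module name, falling back to __spec__.name
    # when the module runs as __main__ (started with `python -m ...`).
    module_name = frame.f_globals.get("__name__", "")
    if module_name == "__main__":
        if (mod := sys.modules.get("__main__", None)) and mod.__spec__:
            module_name = mod.__spec__.name
    return module_name.startswith("pyspark.") and ".tests." not in module_name


if __name__ == "__main__":
    import inspect

    frame = inspect.currentframe()
    assert frame is not None
    # Prints False either way: this demo module is not under the
    # pyspark package, whether run as a script or via `python -m`.
    print(is_pyspark_module(frame))
```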
python/pyspark/sql/tests/test_python_datasource.py (39 changes: 30 additions & 9 deletions)
@@ -1237,8 +1237,20 @@ def writer(self, schema, overwrite):

logs = self.spark.tvf.python_worker_logs()

+# We could get either 1 or 2 "TestJsonWriter.write: abort test" logs because
+# the operation is time-sensitive. When the first partition gets aborted, the
+# executor cancels the remaining tasks; whether the second log appears depends
+# on whether the second partition starts before the cancellation. With the
+# simple worker the second log is often missing because the spawn overhead is
+# large.
+non_abort_logs = logs.select("level", "msg", "context", "logger").filter(
+    "msg != 'TestJsonWriter.write: abort test'"
+)
+abort_logs = logs.select("level", "msg", "context", "logger").filter(
+    "msg == 'TestJsonWriter.write: abort test'"
+)
assertDataFrameEqual(
-    logs.select("level", "msg", "context", "logger"),
+    non_abort_logs,
[
Row(
level="WARNING",
@@ -1283,21 +1295,24 @@ def writer(self, schema, overwrite):
"TestJsonWriter.__init__: ['abort', 'path']",
{"class_name": "TestJsonDataSource", "func_name": "writer"},
),
-(
-    "TestJsonWriter.write: abort test",
-    {"class_name": "TestJsonWriter", "func_name": "write"},
-),
-(
-    "TestJsonWriter.write: abort test",
-    {"class_name": "TestJsonWriter", "func_name": "write"},
-),
(
"TestJsonWriter.abort",
{"class_name": "TestJsonWriter", "func_name": "abort"},
),
-]
+],
)
+assertDataFrameEqual(
+    abort_logs.dropDuplicates(["msg"]),
+    [
+        Row(
+            level="WARNING",
+            msg="TestJsonWriter.write: abort test",
+            context={"class_name": "TestJsonWriter", "func_name": "write"},
+            logger="test_datasource_writer",
+        )
+    ],
+)

def test_data_source_perf_profiler(self):
with self.sql_conf({"spark.sql.pyspark.dataSource.profiler": "perf"}):
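As a standalone illustration of the pattern just added (hypothetical single-node data; only `assertDataFrameEqual`, `dropDuplicates`, and the message text come from the test above), deduplicating the timing-dependent rows is what makes the one-or-two abort logs compare deterministically:

```python
from pyspark.sql import Row, SparkSession
from pyspark.testing import assertDataFrameEqual

spark = SparkSession.builder.getOrCreate()

# Simulate the racy case: the abort-test message was logged twice.
logs = spark.createDataFrame(
    [
        Row(level="WARNING", msg="TestJsonWriter.write: abort test"),
        Row(level="WARNING", msg="TestJsonWriter.write: abort test"),
    ]
)

abort_logs = logs.filter("msg == 'TestJsonWriter.write: abort test'")

# dropDuplicates collapses one-or-two occurrences into exactly one row,
# so the same expected list passes regardless of cancellation timing.
assertDataFrameEqual(
    abort_logs.dropDuplicates(["msg"]),
    [Row(level="WARNING", msg="TestJsonWriter.write: abort test")],
)
```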
@@ -1345,6 +1360,12 @@ class PythonDataSourceTests(BasePythonDataSourceTestsMixin, ReusedSQLTestCase):
...


+class PythonDataSourceTestsWithSimpleWorker(PythonDataSourceTests):
+    @classmethod
+    def conf(self):
+        return super().conf().set("spark.python.use.daemon", "false")


if __name__ == "__main__":
from pyspark.testing import main

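Design note: with `spark.python.use.daemon` set to `false`, each task launches a fresh worker process via `python -m` instead of forking from a daemon, which is exactly the path where `__name__` becomes `"__main__"`; rerunning the whole suite under this subclass is what exercises the new branch in worker_io.py.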