fix: Avoid SyntaxError when loading modules encoded in UTF8 with BOM

john-hen · pawamoy · web-flow · commit b3461901ae08 · 2025-07-21T20:11:27.000+02:00
Changed the encoding from `utf8` to `utf-8-sig` throughout the code base when reading files, in order to ignore a possible byte-order mark (a.k.a. BOM, code point U+FEFF) at the start of the file. As per [the Python documentation](https://docs.python.org/3/howto/unicode.html#reading-and-writing-unicode-data):

> In some areas, it is also convention to use a “BOM” at the start of UTF-8 encoded files; the name is misleading since UTF-8 is not byte-order dependent. The mark simply announces that the file is encoded in UTF-8. For reading such files, use the ‘utf-8-sig’ codec to automatically skip the mark if present.

This change won't affect reading UTF-8-encoded files without a BOM.

Issue-386: #386
PR-387: #387
Co-authored-by: Timothée Mazzucotelli <dev@pawamoy.fr>
diff --git a/src/_griffe/finder.py b/src/_griffe/finder.py
@@ -425,7 +425,7 @@ def _top_module_name(self, path: Path) -> str:
 # TODO: For more robustness, we should load and minify the AST
 # to search for particular call statements.
 def _is_pkg_style_namespace(init_module: Path) -> bool:
-    code = init_module.read_text(encoding="utf8")
+    code = init_module.read_text(encoding="utf-8-sig")
     return bool(_re_pkgresources.search(code) or _re_pkgutil.search(code))
 
 
@@ -455,7 +455,7 @@ def _handle_pth_file(path: Path) -> list[_SP]:
         # It turns out PyTorch recommends its users to use `.pth` as the extension
         # when saving models on the disk. These model files are not encoded in UTF8.
         # If UTF8 decoding fails, we skip the .pth file.
-        text = path.read_text(encoding="utf8")
+        text = path.read_text(encoding="utf-8-sig")
     except UnicodeDecodeError:
         return directories
     for line in text.strip().replace(";", "\n").splitlines(keepends=False):
@@ -476,7 +476,7 @@ def _handle_editable_module(path: Path) -> list[_SP]:
         # And how 'scikit-build-core' writes these files:
         # example line: `install({'griffe': '/media/data/dev/griffe/src/griffe/__init__.py'}, {'cmake_example': ...}, None, False, True)`.
         try:
-            editable_lines = path.read_text(encoding="utf8").strip().splitlines(keepends=False)
+            editable_lines = path.read_text(encoding="utf-8-sig").strip().splitlines(keepends=False)
         except FileNotFoundError as error:
             raise UnhandledEditableModuleError(path) from error
         new_path = Path(editable_lines[-1].split("'")[3])
diff --git a/src/_griffe/loader.py b/src/_griffe/loader.py
@@ -624,7 +624,7 @@ def _create_module(self, module_name: str, module_path: Path | list[Path]) -> Mo
         )
 
     def _visit_module(self, module_name: str, module_path: Path, parent: Module | None = None) -> Module:
-        code = module_path.read_text(encoding="utf8")
+        code = module_path.read_text(encoding="utf-8-sig")
         if self.store_source:
             self.lines_collection[module_path] = code.splitlines(keepends=False)
         start = datetime.now(tz=timezone.utc)
@@ -648,7 +648,7 @@ def _inspect_module(self, module_name: str, filepath: Path | None = None, parent
             if module_name.startswith(prefix):
                 raise ImportError(f"Ignored module '{module_name}'")
         if self.store_source and filepath and filepath.suffix in {".py", ".pyi"}:
-            self.lines_collection[filepath] = filepath.read_text(encoding="utf8").splitlines(keepends=False)
+            self.lines_collection[filepath] = filepath.read_text(encoding="utf-8-sig").splitlines(keepends=False)
         start = datetime.now(tz=timezone.utc)
         try:
             module = inspect(
diff --git a/tests/test_loader.py b/tests/test_loader.py
@@ -508,3 +508,14 @@ def test_not_overriding_module_with_alias_from_wildcard_import() -> None:
         assert pkg["a.m.m"].is_function
         assert pkg["b.m"].is_module
         assert pkg["b.m.m"].is_alias
+
+
+def test_loading_utf8_with_bom_files(tmp_path: Path) -> None:
+    """Check that the loader can handle UTF-8 files with BOM."""
+    pkg = tmp_path / "pkg"
+    pkg.mkdir()
+    init_file = pkg / "__init__.py"
+    init_file.write_text("\ufeff# This is a UTF-8 file with BOM\n\ndef func() -> int: ...", encoding="utf-8")
+    loader = GriffeLoader(search_paths=[tmp_path])
+    package = loader.load("pkg")
+    assert "func" in package.members  # Just checking all went well, no SyntaxError exceptions raised.