Skip to content

Commit b346190

Browse files
john-hen and pawamoy authored
fix: Avoid SyntaxError when loading modules encoded in UTF8 with BOM
Changed the encoding from `utf8` to `utf-8-sig` throughout the code base when reading files, in order to ignore a possible byte-order mark (a.k.a. BOM, code point U+FEFF) at the start of the file.

As per [the Python documentation](https://docs.python.org/3/howto/unicode.html#reading-and-writing-unicode-data):

> In some areas, it is also convention to use a “BOM” at the start of UTF-8 encoded files; the name is misleading since UTF-8 is not byte-order dependent. The mark simply announces that the file is encoded in UTF-8. For reading such files, use the ‘utf-8-sig’ codec to automatically skip the mark if present.

This change won't affect reading UTF8-encoded files without a BOM.

Issue-386: #386
PR-387: #387
Co-authored-by: Timothée Mazzucotelli <[email protected]>
1 parent a8c5585 commit b346190

File tree

3 files changed

+16
-5
lines changed

3 files changed

+16
-5
lines changed

src/_griffe/finder.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -425,7 +425,7 @@ def _top_module_name(self, path: Path) -> str:
425425
# TODO: For more robustness, we should load and minify the AST
426426
# to search for particular call statements.
427427
def _is_pkg_style_namespace(init_module: Path) -> bool:
428-
code = init_module.read_text(encoding="utf8")
428+
code = init_module.read_text(encoding="utf-8-sig")
429429
return bool(_re_pkgresources.search(code) or _re_pkgutil.search(code))
430430

431431

@@ -455,7 +455,7 @@ def _handle_pth_file(path: Path) -> list[_SP]:
455455
# It turns out PyTorch recommends its users to use `.pth` as the extension
456456
# when saving models on the disk. These model files are not encoded in UTF8.
457457
# If UTF8 decoding fails, we skip the .pth file.
458-
text = path.read_text(encoding="utf8")
458+
text = path.read_text(encoding="utf-8-sig")
459459
except UnicodeDecodeError:
460460
return directories
461461
for line in text.strip().replace(";", "\n").splitlines(keepends=False):
@@ -476,7 +476,7 @@ def _handle_editable_module(path: Path) -> list[_SP]:
476476
# And how 'scikit-build-core' writes these files:
477477
# example line: `install({'griffe': '/media/data/dev/griffe/src/griffe/__init__.py'}, {'cmake_example': ...}, None, False, True)`.
478478
try:
479-
editable_lines = path.read_text(encoding="utf8").strip().splitlines(keepends=False)
479+
editable_lines = path.read_text(encoding="utf-8-sig").strip().splitlines(keepends=False)
480480
except FileNotFoundError as error:
481481
raise UnhandledEditableModuleError(path) from error
482482
new_path = Path(editable_lines[-1].split("'")[3])

src/_griffe/loader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -624,7 +624,7 @@ def _create_module(self, module_name: str, module_path: Path | list[Path]) -> Mo
624624
)
625625

626626
def _visit_module(self, module_name: str, module_path: Path, parent: Module | None = None) -> Module:
627-
code = module_path.read_text(encoding="utf8")
627+
code = module_path.read_text(encoding="utf-8-sig")
628628
if self.store_source:
629629
self.lines_collection[module_path] = code.splitlines(keepends=False)
630630
start = datetime.now(tz=timezone.utc)
@@ -648,7 +648,7 @@ def _inspect_module(self, module_name: str, filepath: Path | None = None, parent
648648
if module_name.startswith(prefix):
649649
raise ImportError(f"Ignored module '{module_name}'")
650650
if self.store_source and filepath and filepath.suffix in {".py", ".pyi"}:
651-
self.lines_collection[filepath] = filepath.read_text(encoding="utf8").splitlines(keepends=False)
651+
self.lines_collection[filepath] = filepath.read_text(encoding="utf-8-sig").splitlines(keepends=False)
652652
start = datetime.now(tz=timezone.utc)
653653
try:
654654
module = inspect(

tests/test_loader.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,3 +508,14 @@ def test_not_overriding_module_with_alias_from_wildcard_import() -> None:
508508
assert pkg["a.m.m"].is_function
509509
assert pkg["b.m"].is_module
510510
assert pkg["b.m.m"].is_alias
511+
512+
513+
def test_loading_utf8_with_bom_files(tmp_path: Path) -> None:
514+
"""Check that the loader can handle UTF-8 files with BOM."""
515+
pkg = tmp_path / "pkg"
516+
pkg.mkdir()
517+
init_file = pkg / "__init__.py"
518+
init_file.write_text("\ufeff# This is a UTF-8 file with BOM\n\ndef func() -> int: ...", encoding="utf-8")
519+
loader = GriffeLoader(search_paths=[tmp_path])
520+
package = loader.load("pkg")
521+
assert "func" in package.members # Just checking all went well, no SyntaxError exceptions raised.

0 commit comments

Comments
 (0)