2026-02-01 09:31:38 +01:00

336 lines
9.9 KiB
Python

"""
This module provides :class:`GitIgnoreSpecPattern` which implements Git's
`gitignore`_ patterns, and handles edge-cases where Git's behavior differs from
what's documented. Git allows including files from excluded directories which
appears to contradict the documentation. This is used by
:class:`~pathspec.gitignore.GitIgnoreSpec` to fully replicate Git's handling.
.. _`gitignore`: https://git-scm.com/docs/gitignore
"""
from typing import (
Optional) # Replaced by `X | None` in 3.10.
from pathspec._typing import (
AnyStr, # Removed in 3.18.
assert_unreachable,
override) # Added in 3.12.
from .base import (
GitIgnorePatternError,
_BYTES_ENCODING,
_GitIgnoreBasePattern)
_DIR_MARK = 'ps_d'
"""
The regex group name for the directory marker. This is only used by
:class:`GitIgnoreSpec`.
"""
_DIR_MARK_CG = f'(?P<{_DIR_MARK}>/)'
"""
This regular expression matches the directory marker.
"""
_DIR_MARK_OPT = f'(?:{_DIR_MARK_CG}|$)'
"""
This regular expression matches the optional directory marker and sub-path.
"""
class GitIgnoreSpecPattern(_GitIgnoreBasePattern):
"""
The :class:`GitIgnoreSpecPattern` class represents a compiled gitignore
pattern with special handling for edge-cases to replicate Git's behavior.
This is registered under the deprecated name "gitwildmatch" for backward
compatibility with v0.12. The registered name will be removed in a future
version.
"""
# Keep the dict-less class hierarchy.
__slots__ = ()
@staticmethod
def __normalize_segments(
is_dir_pattern: bool,
pattern_segs: list[str],
) -> tuple[Optional[list[str]], Optional[str]]:
"""
Normalize the pattern segments to make processing easier.
*is_dir_pattern* (:class:`bool`) is whether the pattern is a directory
pattern (i.e., ends with a slash '/').
*pattern_segs* (:class:`list` of :class:`str`) contains the pattern
segments. This may be modified in place.
Returns a :class:`tuple` containing either:
- The normalized segments (:class:`list` of :class:`str`; or :data:`None`).
- The regular expression override (:class:`str` or :data:`None`).
"""
if not pattern_segs[0]:
# A pattern beginning with a slash ('/') should match relative to the root
# directory. Remove the empty first segment to make the pattern relative
# to root.
del pattern_segs[0]
elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
# A single segment pattern with or without a trailing slash ('/') will
# match any descendant path. This is equivalent to "**/{pattern}". Prepend
# double-asterisk segment to make pattern relative to root.
if pattern_segs[0] != '**':
pattern_segs.insert(0, '**')
else:
# A pattern without a beginning slash ('/') but contains at least one
# prepended directory (e.g., "dir/{pattern}") should match relative to the
# root directory. No segment modification is needed.
pass
if not pattern_segs:
# After normalization, we end up with no pattern at all. This must be
# because the pattern is invalid.
raise ValueError("Pattern normalized to nothing.")
if not pattern_segs[-1]:
# A pattern ending with a slash ('/') will match all descendant paths if
# it is a directory but not if it is a regular file. This is equivalent to
# "{pattern}/**". Set empty last segment to a double-asterisk to include
# all descendants.
pattern_segs[-1] = '**'
# EDGE CASE: Collapse duplicate double-asterisk sequences (i.e., '**/**').
# Iterate over the segments in reverse order and remove the duplicate double
# asterisks as we go.
for i in range(len(pattern_segs) - 1, 0, -1):
prev = pattern_segs[i-1]
seg = pattern_segs[i]
if prev == '**' and seg == '**':
del pattern_segs[i]
seg_count = len(pattern_segs)
if seg_count == 1 and pattern_segs[0] == '**':
if is_dir_pattern:
# The pattern "**/" will be normalized to "**", but it should match
# everything except for files in the root. Special case this pattern.
return (None, _DIR_MARK_CG)
else:
# The pattern "**" will match every path. Special case this pattern.
return (None, '.')
elif (
seg_count == 2
and pattern_segs[0] == '**'
and pattern_segs[1] == '*'
):
# The pattern "*" will be normalized to "**/*" and will match every
# path. Special case this pattern for efficiency.
return (None, '.')
elif (
seg_count == 3
and pattern_segs[0] == '**'
and pattern_segs[1] == '*'
and pattern_segs[2] == '**'
):
# The pattern "*/" will be normalized to "**/*/**" which will match every
# file not in the root directory. Special case this pattern for
# efficiency.
if is_dir_pattern:
return (None, _DIR_MARK_CG)
else:
return (None, '/')
# No regular expression override, return modified pattern segments.
return (pattern_segs, None)
@override
@classmethod
def pattern_to_regex(
cls,
pattern: AnyStr,
) -> tuple[Optional[AnyStr], Optional[bool]]:
"""
Convert the pattern into a regular expression.
*pattern* (:class:`str` or :class:`bytes`) is the pattern to convert into a
regular expression.
Returns a :class:`tuple` containing:
- *pattern* (:class:`str`, :class:`bytes` or :data:`None`) is the
uncompiled regular expression.
- *include* (:class:`bool` or :data:`None`) is whether matched files
should be included (:data:`True`), excluded (:data:`False`), or is a
null-operation (:data:`None`).
"""
if isinstance(pattern, str):
pattern_str = pattern
return_type = str
elif isinstance(pattern, bytes):
pattern_str = pattern.decode(_BYTES_ENCODING)
return_type = bytes
else:
raise TypeError(f"{pattern=!r} is not a unicode or byte string.")
original_pattern = pattern_str
del pattern
if pattern_str.endswith('\\ '):
# EDGE CASE: Spaces can be escaped with backslash. If a pattern that ends
# with a backslash is followed by a space, do not strip from the left.
pass
else:
# EDGE CASE: Leading spaces should be kept (only trailing spaces should be
# removed). Git does not remove leading spaces.
pattern_str = pattern_str.rstrip()
regex: Optional[str]
include: Optional[bool]
if not pattern_str:
# A blank pattern is a null-operation (neither includes nor excludes
# files).
return (None, None)
elif pattern_str.startswith('#'):
# A pattern starting with a hash ('#') serves as a comment (neither
# includes nor excludes files). Escape the hash with a backslash to match
# a literal hash (i.e., '\#').
return (None, None)
elif pattern_str == '/':
# EDGE CASE: According to `git check-ignore` (v2.4.1), a single '/' does
# not match any file.
return (None, None)
if pattern_str.startswith('!'):
# A pattern starting with an exclamation mark ('!') negates the pattern
# (exclude instead of include). Escape the exclamation mark with a back
# slash to match a literal exclamation mark (i.e., '\!').
include = False
# Remove leading exclamation mark.
pattern_str = pattern_str[1:]
else:
include = True
# Split pattern into segments.
pattern_segs = pattern_str.split('/')
# Check whether the pattern is specifically a directory pattern before
# normalization.
is_dir_pattern = not pattern_segs[-1]
# Normalize pattern to make processing easier.
try:
pattern_segs, override_regex = cls.__normalize_segments(
is_dir_pattern, pattern_segs,
)
except ValueError as e:
raise GitIgnorePatternError((
f"Invalid git pattern: {original_pattern!r}"
)) from e # GitIgnorePatternError
if override_regex is not None:
# Use regex override.
regex = override_regex
elif pattern_segs is not None:
# Build regular expression from pattern.
try:
regex_parts = cls.__translate_segments(is_dir_pattern, pattern_segs)
except ValueError as e:
raise GitIgnorePatternError((
f"Invalid git pattern: {original_pattern!r}"
)) from e # GitIgnorePatternError
regex = ''.join(regex_parts)
else:
assert_unreachable((
f"{override_regex=} and {pattern_segs=} cannot both be null."
)) # assert_unreachable
# Encode regex if needed.
out_regex: AnyStr
if regex is not None and return_type is bytes:
out_regex = regex.encode(_BYTES_ENCODING)
else:
out_regex = regex
return (out_regex, include)
@classmethod
def __translate_segments(
cls,
is_dir_pattern: bool,
pattern_segs: list[str],
) -> list[str]:
"""
Translate the pattern segments to regular expressions.
*is_dir_pattern* (:class:`bool`) is whether the pattern is a directory
pattern (i.e., ends with a slash '/').
*pattern_segs* (:class:`list` of :class:`str`) contains the pattern
segments.
Returns the regular expression parts (:class:`list` of :class:`str`).
"""
# Build regular expression from pattern.
out_parts = []
need_slash = False
end = len(pattern_segs) - 1
for i, seg in enumerate(pattern_segs):
if seg == '**':
if i == 0:
# A normalized pattern beginning with double-asterisks ('**') will
# match any leading path segments.
out_parts.append('^(?:.+/)?')
elif i < end:
# A pattern with inner double-asterisks ('**') will match multiple (or
# zero) inner path segments.
out_parts.append('(?:/.+)?')
need_slash = True
else:
assert i == end, (i, end)
# A normalized pattern ending with double-asterisks ('**') will match
# any trailing path segments.
if is_dir_pattern:
out_parts.append(_DIR_MARK_CG)
else:
out_parts.append('/')
else:
# Match path segment.
if i == 0:
# Anchor to root directory.
out_parts.append('^')
if need_slash:
out_parts.append('/')
if seg == '*':
# Match whole path segment.
out_parts.append('[^/]+')
else:
# Match segment glob pattern.
out_parts.append(cls._translate_segment_glob(seg))
if i == end:
# A pattern ending without a slash ('/') will match a file or a
# directory (with paths underneath it). E.g., "foo" matches "foo",
# "foo/bar", "foo/bar/baz", etc.
out_parts.append(_DIR_MARK_OPT)
need_slash = True
return out_parts