"""
This module provides :class:`GitIgnoreSpecPattern` which implements Git's
`gitignore`_ patterns, and handles edge-cases where Git's behavior differs from
what's documented. Git allows including files from excluded directories which
appears to contradict the documentation. This is used by
:class:`~pathspec.gitignore.GitIgnoreSpec` to fully replicate Git's handling.

.. _`gitignore`: https://git-scm.com/docs/gitignore
"""

from typing import (
    Optional)  # Replaced by `X | None` in 3.10.

from pathspec._typing import (
    AnyStr,  # Removed in 3.18.
    assert_unreachable,
    override)  # Added in 3.12.

from .base import (
    GitIgnorePatternError,
    _BYTES_ENCODING,
    _GitIgnoreBasePattern)

_DIR_MARK = 'ps_d'
"""
The regex group name for the directory marker. This is only used by
:class:`GitIgnoreSpec`.
"""

_DIR_MARK_CG = f'(?P<{_DIR_MARK}>/)'
"""
This regular expression matches the directory marker.
"""

_DIR_MARK_OPT = f'(?:{_DIR_MARK_CG}|$)'
"""
This regular expression matches the optional directory marker and sub-path.
"""


class GitIgnoreSpecPattern(_GitIgnoreBasePattern):
    """
    The :class:`GitIgnoreSpecPattern` class represents a compiled gitignore
    pattern with special handling for edge-cases to replicate Git's behavior.

    This is registered under the deprecated name "gitwildmatch" for backward
    compatibility with v0.12. The registered name will be removed in a future
    version.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @staticmethod
    def __normalize_segments(
        is_dir_pattern: bool,
        pattern_segs: list[str],
    ) -> tuple[Optional[list[str]], Optional[str]]:
        """
        Normalize the pattern segments to make processing easier.

        *is_dir_pattern* (:class:`bool`) is whether the pattern is a directory
        pattern (i.e., ends with a slash '/').

        *pattern_segs* (:class:`list` of :class:`str`) contains the pattern
        segments. This may be modified in place.

        Raises :class:`ValueError` if no segments remain after normalization.

        Returns a :class:`tuple` containing either:

        - The normalized segments (:class:`list` of :class:`str`; or
          :data:`None`).

        - The regular expression override (:class:`str` or :data:`None`).

        Exactly one of the two members is not :data:`None`.
        """
        if not pattern_segs[0]:
            # A pattern beginning with a slash ('/') should match relative to the
            # root directory. Remove the empty first segment to make the pattern
            # relative to root.
            del pattern_segs[0]

        elif len(pattern_segs) == 1 or (
            len(pattern_segs) == 2 and not pattern_segs[1]
        ):
            # A single segment pattern with or without a trailing slash ('/') will
            # match any descendant path. This is equivalent to "**/{pattern}".
            # Prepend double-asterisk segment to make pattern relative to root.
            if pattern_segs[0] != '**':
                pattern_segs.insert(0, '**')

        else:
            # A pattern without a beginning slash ('/') but contains at least one
            # prepended directory (e.g., "dir/{pattern}") should match relative to
            # the root directory. No segment modification is needed.
            pass

        if not pattern_segs:
            # After normalization, we end up with no pattern at all. This must be
            # because the pattern is invalid.
            raise ValueError("Pattern normalized to nothing.")

        if not pattern_segs[-1]:
            # A pattern ending with a slash ('/') will match all descendant paths
            # if it is a directory but not if it is a regular file. This is
            # equivalent to "{pattern}/**". Set empty last segment to a
            # double-asterisk to include all descendants.
            pattern_segs[-1] = '**'

        # EDGE CASE: Collapse duplicate double-asterisk sequences (i.e., '**/**').
        # Iterate over the segments in reverse order and remove the duplicate
        # double asterisks as we go. Walking backwards keeps the not-yet-visited
        # indices valid while deleting in place.
        for i in range(len(pattern_segs) - 1, 0, -1):
            prev = pattern_segs[i-1]
            seg = pattern_segs[i]
            if prev == '**' and seg == '**':
                del pattern_segs[i]

        seg_count = len(pattern_segs)
        if seg_count == 1 and pattern_segs[0] == '**':
            if is_dir_pattern:
                # The pattern "**/" will be normalized to "**", but it should match
                # everything except for files in the root. Special case this
                # pattern.
                return (None, _DIR_MARK_CG)
            else:
                # The pattern "**" will match every path. Special case this
                # pattern.
                return (None, '.')

        elif (
            seg_count == 2
            and pattern_segs[0] == '**'
            and pattern_segs[1] == '*'
        ):
            # The pattern "*" will be normalized to "**/*" and will match every
            # path. Special case this pattern for efficiency.
            return (None, '.')

        elif (
            seg_count == 3
            and pattern_segs[0] == '**'
            and pattern_segs[1] == '*'
            and pattern_segs[2] == '**'
        ):
            # The pattern "*/" will be normalized to "**/*/**" which will match
            # every file not in the root directory. Special case this pattern for
            # efficiency.
            if is_dir_pattern:
                return (None, _DIR_MARK_CG)
            else:
                return (None, '/')

        # No regular expression override, return modified pattern segments.
        return (pattern_segs, None)

    @staticmethod
    def __rstrip_pattern(pattern_str: str) -> str:
        """
        Remove insignificant trailing whitespace from the pattern.

        Trailing spaces are ignored unless they are quoted with a backslash.
        Leading spaces are always kept (Git does not remove them).

        *pattern_str* (:class:`str`) is the raw pattern.

        Returns the pattern (:class:`str`) without its unescaped trailing
        whitespace.
        """
        stripped = pattern_str.rstrip()
        if len(stripped) == len(pattern_str):
            # Nothing to strip.
            return stripped

        if pattern_str[len(stripped)] != ' ':
            # The first trailing character is not a space, so there is no escaped
            # space to preserve.
            return stripped

        # EDGE CASE: Spaces can be escaped with backslash. The first trailing
        # space is escaped only when it is preceded by an odd number of
        # backslashes; an even number means the backslashes escape each other
        # (e.g., "foo\\\\ " ends with a literal backslash and an unescaped space).
        backslash_count = len(stripped) - len(stripped.rstrip('\\'))
        if backslash_count % 2:
            # Keep the escaped space, drop everything after it.
            return stripped + ' '

        return stripped

    @override
    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert into
        a regular expression.

        Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
        :class:`bytes`.

        Raises :class:`GitIgnorePatternError` if the pattern is invalid.

        Returns a :class:`tuple` containing:

        - *pattern* (:class:`str`, :class:`bytes` or :data:`None`) is the
          uncompiled regular expression.

        - *include* (:class:`bool` or :data:`None`) is whether matched files
          should be included (:data:`True`), excluded (:data:`False`), or is a
          null-operation (:data:`None`).
        """
        if isinstance(pattern, str):
            pattern_str = pattern
            return_type = str
        elif isinstance(pattern, bytes):
            pattern_str = pattern.decode(_BYTES_ENCODING)
            return_type = bytes
        else:
            raise TypeError(f"{pattern=!r} is not a unicode or byte string.")

        # Keep the untouched pattern around for error messages.
        original_pattern = pattern_str
        del pattern

        # EDGE CASE: Leading spaces should be kept (only trailing spaces should be
        # removed). Git does not remove leading spaces. A trailing space escaped
        # with a backslash is kept as well.
        pattern_str = cls.__rstrip_pattern(pattern_str)

        regex: Optional[str]
        include: Optional[bool]

        if not pattern_str:
            # A blank pattern is a null-operation (neither includes nor excludes
            # files).
            return (None, None)

        elif pattern_str.startswith('#'):
            # A pattern starting with a hash ('#') serves as a comment (neither
            # includes nor excludes files). Escape the hash with a backslash to
            # match a literal hash (i.e., '\#').
            return (None, None)

        elif pattern_str == '/':
            # EDGE CASE: According to `git check-ignore` (v2.4.1), a single '/'
            # does not match any file.
            return (None, None)

        if pattern_str.startswith('!'):
            # A pattern starting with an exclamation mark ('!') negates the
            # pattern (exclude instead of include). Escape the exclamation mark
            # with a back slash to match a literal exclamation mark (i.e., '\!').
            include = False
            # Remove leading exclamation mark.
            pattern_str = pattern_str[1:]
        else:
            include = True

        # Split pattern into segments.
        pattern_segs = pattern_str.split('/')

        # Check whether the pattern is specifically a directory pattern before
        # normalization.
        is_dir_pattern = not pattern_segs[-1]

        # Normalize pattern to make processing easier.
        try:
            pattern_segs, override_regex = cls.__normalize_segments(
                is_dir_pattern, pattern_segs,
            )
        except ValueError as e:
            raise GitIgnorePatternError((
                f"Invalid git pattern: {original_pattern!r}"
            )) from e  # GitIgnorePatternError

        if override_regex is not None:
            # Use regex override.
            regex = override_regex

        elif pattern_segs is not None:
            # Build regular expression from pattern.
            try:
                regex_parts = cls.__translate_segments(
                    is_dir_pattern, pattern_segs,
                )
            except ValueError as e:
                raise GitIgnorePatternError((
                    f"Invalid git pattern: {original_pattern!r}"
                )) from e  # GitIgnorePatternError

            regex = ''.join(regex_parts)

        else:
            assert_unreachable((
                f"{override_regex=} and {pattern_segs=} cannot both be null."
            ))  # assert_unreachable

        # Encode regex if needed.
        out_regex: AnyStr
        if regex is not None and return_type is bytes:
            out_regex = regex.encode(_BYTES_ENCODING)
        else:
            out_regex = regex

        return (out_regex, include)

    @classmethod
    def __translate_segments(
        cls,
        is_dir_pattern: bool,
        pattern_segs: list[str],
    ) -> list[str]:
        """
        Translate the pattern segments to regular expressions.

        *is_dir_pattern* (:class:`bool`) is whether the pattern is a directory
        pattern (i.e., ends with a slash '/').

        *pattern_segs* (:class:`list` of :class:`str`) contains the pattern
        segments.

        Returns the regular expression parts (:class:`list` of :class:`str`).
        """
        # Build regular expression from pattern.
        out_parts = []
        # Whether the next regular segment must be preceded by a literal slash.
        need_slash = False
        end = len(pattern_segs) - 1
        for i, seg in enumerate(pattern_segs):
            if seg == '**':
                if i == 0:
                    # A normalized pattern beginning with double-asterisks ('**')
                    # will match any leading path segments.
                    out_parts.append('^(?:.+/)?')

                elif i < end:
                    # A pattern with inner double-asterisks ('**') will match
                    # multiple (or zero) inner path segments.
                    out_parts.append('(?:/.+)?')
                    need_slash = True

                else:
                    assert i == end, (i, end)
                    # A normalized pattern ending with double-asterisks ('**')
                    # will match any trailing path segments.
                    if is_dir_pattern:
                        out_parts.append(_DIR_MARK_CG)
                    else:
                        out_parts.append('/')

            else:
                # Match path segment.
                if i == 0:
                    # Anchor to root directory.
                    out_parts.append('^')

                if need_slash:
                    out_parts.append('/')

                if seg == '*':
                    # Match whole path segment.
                    out_parts.append('[^/]+')

                else:
                    # Match segment glob pattern.
                    out_parts.append(cls._translate_segment_glob(seg))

                if i == end:
                    # A pattern ending without a slash ('/') will match a file or
                    # a directory (with paths underneath it). E.g., "foo" matches
                    # "foo", "foo/bar", "foo/bar/baz", etc.
                    out_parts.append(_DIR_MARK_OPT)

                need_slash = True

        return out_parts