Initial commit
backend/venv/Lib/site-packages/nltk/tokenize/__init__.py (new file, 145 lines)
@@ -0,0 +1,145 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com> (minor additions)
# Contributors: matthewmc, clouds56
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

r"""
NLTK Tokenizer Package

Tokenizers divide strings into lists of substrings. For example,
tokenizers can be used to find the words and punctuation in a string:

    >>> from nltk.tokenize import word_tokenize
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
    ... two of them.\n\nThanks.'''
    >>> word_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

This particular tokenizer requires the Punkt sentence tokenization
models to be installed. NLTK also provides a simpler,
regular-expression based tokenizer, which splits text on whitespace
and punctuation:

    >>> from nltk.tokenize import wordpunct_tokenize
    >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

We can also operate at the level of sentences, using the sentence
tokenizer directly as follows:

    >>> from nltk.tokenize import sent_tokenize, word_tokenize
    >>> sent_tokenize(s)
    ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
    >>> [word_tokenize(t) for t in sent_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
    [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
    ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]

Caution: when tokenizing a Unicode string, make sure you are not
using an encoded version of the string (it may be necessary to
decode it first, e.g. with ``s.decode("utf8")``).

NLTK tokenizers can produce token-spans, represented as tuples of integers
having the same semantics as string slices, to support efficient comparison
of tokenizers. (These methods are implemented as generators.)

    >>> from nltk.tokenize import WhitespaceTokenizer
    >>> list(WhitespaceTokenizer().span_tokenize(s)) # doctest: +NORMALIZE_WHITESPACE
    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
    (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]

There are numerous ways to tokenize text. If you need more control over
tokenization, see the other methods provided in this package.

For further information, please see Chapter 3 of the NLTK book.
"""

import functools
import re

from nltk.data import load
from nltk.tokenize.casual import TweetTokenizer, casual_tokenize
from nltk.tokenize.destructive import NLTKWordTokenizer
from nltk.tokenize.legality_principle import LegalitySyllableTokenizer
from nltk.tokenize.mwe import MWETokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTokenizer
from nltk.tokenize.regexp import (
    BlanklineTokenizer,
    RegexpTokenizer,
    WhitespaceTokenizer,
    WordPunctTokenizer,
    blankline_tokenize,
    regexp_tokenize,
    wordpunct_tokenize,
)
from nltk.tokenize.repp import ReppTokenizer
from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize
from nltk.tokenize.simple import (
    LineTokenizer,
    SpaceTokenizer,
    TabTokenizer,
    line_tokenize,
)
from nltk.tokenize.sonority_sequencing import SyllableTokenizer
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize


@functools.lru_cache
def _get_punkt_tokenizer(language="english"):
    """
    A constructor for the PunktTokenizer that utilizes
    an LRU cache for performance.

    :param language: the model name in the Punkt corpus
    :type language: str
    """
    return PunktTokenizer(language)


# Standard sentence tokenizer.
def sent_tokenize(text, language="english"):
    """
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
    """
    tokenizer = _get_punkt_tokenizer(language)
    return tokenizer.tokenize(text)


# Standard word tokenizer.
_treebank_word_tokenizer = NLTKWordTokenizer()


def word_tokenize(text, language="english", preserve_line=False):
    """
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: A flag to decide whether to sentence tokenize the text or not.
    :type preserve_line: bool
    """
    sentences = [text] if preserve_line else sent_tokenize(text, language)
    return [
        token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
    ]
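Reviewer note: a minimal usage sketch of the API added above (illustrative only, not part of the committed file). It shows the `preserve_line` flag: by default `word_tokenize()` runs `sent_tokenize()` first, so only true sentence-final periods are split off; with `preserve_line=True` the whole string is treated as one line. It assumes the Punkt models are already installed, e.g. via `nltk.download("punkt_tab")`.

from nltk.tokenize import word_tokenize

text = "Dr. Smith arrived. He was late."
# Default path: sentence-split first, so each sentence's final '.' becomes
# its own token while the abbreviation period in "Dr." stays attached.
print(word_tokenize(text))
# Skip sentence splitting: only the period at the very end of the string is
# split off, so "arrived." survives as a single token.
print(word_tokenize(text, preserve_line=True))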
backend/venv/Lib/site-packages/nltk/tokenize/api.py (new file, 83 lines)
@@ -0,0 +1,83 @@
# Natural Language Toolkit: Tokenizer Interface
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Tokenizer Interface
"""

from abc import ABC, abstractmethod
from typing import Iterator, List, Tuple

from nltk.internals import overridden
from nltk.tokenize.util import string_span_tokenize


class TokenizerI(ABC):
    """
    A processing interface for tokenizing a string.
    Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
    """

    @abstractmethod
    def tokenize(self, s: str) -> List[str]:
        """
        Return a tokenized copy of *s*.

        :rtype: List[str]
        """
        if overridden(self.tokenize_sents):
            return self.tokenize_sents([s])[0]

    def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
        """
        Identify the tokens using integer offsets ``(start_i, end_i)``,
        where ``s[start_i:end_i]`` is the corresponding token.

        :rtype: Iterator[Tuple[int, int]]
        """
        raise NotImplementedError()

    def tokenize_sents(self, strings: List[str]) -> List[List[str]]:
        """
        Apply ``self.tokenize()`` to each element of ``strings``.  I.e.:

            return [self.tokenize(s) for s in strings]

        :rtype: List[List[str]]
        """
        return [self.tokenize(s) for s in strings]

    def span_tokenize_sents(
        self, strings: List[str]
    ) -> Iterator[List[Tuple[int, int]]]:
        """
        Apply ``self.span_tokenize()`` to each element of ``strings``.  I.e.:

            return [self.span_tokenize(s) for s in strings]

        :yield: List[Tuple[int, int]]
        """
        for s in strings:
            yield list(self.span_tokenize(s))


class StringTokenizer(TokenizerI):
    """A tokenizer that divides a string into substrings by splitting
    on the specified string (defined in subclasses).
    """

    @property
    @abstractmethod
    def _string(self):
        raise NotImplementedError

    def tokenize(self, s):
        return s.split(self._string)

    def span_tokenize(self, s):
        yield from string_span_tokenize(s, self._string)
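Reviewer note: a sketch (not part of the diff) of how the ``TokenizerI`` contract above is meant to be consumed. A subclass only needs ``tokenize()``; ``tokenize_sents()`` comes for free. ``CommaTokenizer`` is a hypothetical example class, not an NLTK API.

from typing import List

from nltk.tokenize.api import TokenizerI


class CommaTokenizer(TokenizerI):
    """Hypothetical example: split on commas."""

    def tokenize(self, s: str) -> List[str]:
        return s.split(",")


# tokenize_sents() is inherited from TokenizerI and maps tokenize() over a list:
print(CommaTokenizer().tokenize_sents(["a,b", "c,d,e"]))
# [['a', 'b'], ['c', 'd', 'e']]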
backend/venv/Lib/site-packages/nltk/tokenize/casual.py (new file, 458 lines)
@@ -0,0 +1,458 @@
#
# Natural Language Toolkit: Twitter Tokenizer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Christopher Potts <cgpotts@stanford.edu>
#         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
#         Pierpaolo Pantone <> (modifications)
#         Tom Aarsen <> (modifications)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#


"""
Twitter-aware tokenizer, designed to be flexible and easy to adapt to new
domains and tasks. The basic logic is this:

1. The tuple REGEXPS defines a list of regular expression
   strings.

2. The REGEXPS strings are put, in order, into a compiled
   regular expression object called WORD_RE, under the TweetTokenizer
   class.

3. The tokenization is done by WORD_RE.findall(s), where s is the
   user-supplied string, inside the tokenize() method of the class
   TweetTokenizer.

4. When instantiating Tokenizer objects, there are several options:
    * preserve_case. By default, it is set to True. If it is set to
      False, then the tokenizer will downcase everything except for
      emoticons.
    * reduce_len. By default, it is set to False. It specifies whether
      to replace repeated character sequences of length 3 or greater
      with sequences of length 3.
    * strip_handles. By default, it is set to False. It specifies
      whether to remove Twitter handles of text used in the
      `tokenize` method.
    * match_phone_numbers. By default, it is set to True. It indicates
      whether the `tokenize` method should look for phone numbers.
"""


######################################################################

import html
from typing import List

import regex  # https://github.com/nltk/nltk/issues/2409

from nltk.tokenize.api import TokenizerI

######################################################################
# The following strings are components in the regular expression
# that is used for tokenizing. It's important that phone_number
# appears first in the final regex (since it can contain whitespace).
# It also could matter that tags comes after emoticons, due to the
# possibility of having text like
#
#     <:| and some text >:)
#
# Most importantly, the final element should always be last, since it
# does a last ditch whitespace-based tokenization of whatever is left.

# ToDo: Update with https://en.wikipedia.org/wiki/List_of_emoticons ?

# This particular element is used in a couple ways, so we define it
# with a name:
EMOTICONS = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      </?3                       # heart
    )"""

# URL pattern due to John Gruber, modified by Tom Winzig. See
# https://gist.github.com/winzig/8894715

URLS = r"""             # Capture 1: entire matched URL
  (?:
  https?:               # URL protocol and colon
    (?:
      /{1,3}            # 1-3 slashes
      |                 #   or
      [a-z0-9%]         # Single letter or digit or '%'
                        # (Trying not to match e.g. "URI::Escape")
    )
    |                   #   or
    # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:                   # One or more:
    [^\s()<>{}\[\]]+    # Run of non-space, non-()<>{}[]
    |                   #   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)         # balanced parens, non-recursive: (...)
  )+
  (?:                   # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)         # balanced parens, non-recursive: (...)
    |                   #   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]  # not a space or one of these punct chars
  )
  |                     # OR, the following to match naked domains:
  (?:
    (?<!@)              # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)               # not succeeded by a @,
                        # avoid matching "foo.na" in "foo.na@example.com"
  )
"""

# emoji flag sequence
# https://en.wikipedia.org/wiki/Regional_indicator_symbol
# For regex simplicity, include all possible enclosed letter pairs,
# not the ISO subset of two-letter regional indicator symbols.
# See https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Current_codes
# Future regional flag support may be handled with the regex for
# U+1F3F4 🏴 followed by emoji tag sequences:
# r'\U0001F3F4[\U000E0000-\U000E007E]{5}\U000E007F'
FLAGS = r"""
    (?:
      [\U0001F1E6-\U0001F1FF]{2}  # all enclosed letter pairs
      |
      # English flag
      \U0001F3F4\U000E0067\U000E0062\U000E0065\U000E006e\U000E0067\U000E007F
      |
      # Scottish flag
      \U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007F
      |
      # For Wales? Why Richard, it profit a man nothing to give his soul for the whole world … but for Wales!
      \U0001F3F4\U000E0067\U000E0062\U000E0077\U000E006C\U000E0073\U000E007F
    )
    """

# Regex for recognizing phone numbers:
PHONE_REGEX = r"""
    (?:
      (?:            # (international)
        \+?[01]
        [ *\-.\)]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3}          # exchange
      [ *\-.\)]*
      \d{4}          # base
    )"""

# The components of the tokenizer:
REGEXPS = (
    URLS,
    # ASCII Emoticons
    EMOTICONS,
    # HTML tags:
    r"""<[^>\s]+>""",
    # ASCII Arrows
    r"""[\-]+>|<[\-]+""",
    # Twitter username:
    r"""(?:@[\w_]+)""",
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
    # email addresses
    r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
    # Zero-Width-Joiner and Skin tone modifier emojis
    """.(?:
        [\U0001F3FB-\U0001F3FF]?(?:\u200d.[\U0001F3FB-\U0001F3FF]?)+
        |
        [\U0001F3FB-\U0001F3FF]
    )""",
    # flags
    FLAGS,
    # Remaining word types:
    r"""
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """,
)

# Take the main components and add a phone regex as the second parameter
REGEXPS_PHONE = (REGEXPS[0], PHONE_REGEX, *REGEXPS[1:])

######################################################################
# TweetTokenizer.WORD_RE and TweetTokenizer.PHONE_WORD_RE represent
# the core tokenizing regexes. They are compiled lazily.

# WORD_RE performs poorly on these patterns:
HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")

# The emoticon string gets its own regex so that we can preserve case for
# them as needed:
EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE)

# These are for regularizing HTML entities to Unicode:
ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")

# For stripping away handles from a tweet:
HANDLES_RE = regex.compile(
    r"(?<![A-Za-z0-9_!@#\$%&*])@"
    r"(([A-Za-z0-9_]){15}(?!@)|([A-Za-z0-9_]){1,14}(?![A-Za-z0-9_]*@))"
)


######################################################################
# Functions for converting html entities
######################################################################


def _str_to_unicode(text, encoding=None, errors="strict"):
    if encoding is None:
        encoding = "utf-8"
    if isinstance(text, bytes):
        return text.decode(encoding, errors)
    return text


def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"):
    """
    Remove entities from text by converting them to their
    corresponding unicode character.

    :param text: a unicode string or a byte string encoded in the given
        `encoding` (which defaults to 'utf-8').

    :param list keep: list of entity names which should not be replaced.\
    This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    :param bool remove_illegal: If `True`, entities that can't be converted are\
    removed. Otherwise, entities that can't be converted are kept "as
    is".

    :returns: A unicode string with the entities removed.

    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

        >>> from nltk.tokenize.casual import _replace_html_entities
        >>> _replace_html_entities(b'Price: &pound;100')
        'Price: \\xa3100'
        >>> print(_replace_html_entities(b'Price: &pound;100'))
        Price: £100
        >>>
    """

    def _convert_entity(match):
        entity_body = match.group(3)
        if match.group(1):
            try:
                if match.group(2):
                    number = int(entity_body, 16)
                else:
                    number = int(entity_body, 10)
                # Numeric character references in the 80-9F range are typically
                # interpreted by browsers as representing the characters mapped
                # to bytes 80-9F in the Windows-1252 encoding. For more info
                # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets
                if 0x80 <= number <= 0x9F:
                    return bytes((number,)).decode("cp1252")
            except ValueError:
                number = None
        else:
            if entity_body in keep:
                return match.group(0)
            number = html.entities.name2codepoint.get(entity_body)
        if number is not None:
            try:
                return chr(number)
            except (ValueError, OverflowError):
                pass

        return "" if remove_illegal else match.group(0)

    return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))


######################################################################


class TweetTokenizer(TokenizerI):
    r"""
    Tokenizer for tweets.

        >>> from nltk.tokenize import TweetTokenizer
        >>> tknzr = TweetTokenizer()
        >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
        >>> tknzr.tokenize(s0) # doctest: +NORMALIZE_WHITESPACE
        ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->',
        '<--']

    Examples using the `strip_handles` and `reduce_len` parameters:

        >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
        >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
        >>> tknzr.tokenize(s1)
        [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
    """

    # Values used to lazily compile WORD_RE and PHONE_WORD_RE,
    # which are the core tokenizing regexes.
    _WORD_RE = None
    _PHONE_WORD_RE = None

    ######################################################################

    def __init__(
        self,
        preserve_case=True,
        reduce_len=False,
        strip_handles=False,
        match_phone_numbers=True,
    ):
        """
        Create a `TweetTokenizer` instance with settings for use in the `tokenize` method.

        :param preserve_case: Flag indicating whether to preserve the casing (capitalisation)
            of text used in the `tokenize` method. Defaults to True.
        :type preserve_case: bool
        :param reduce_len: Flag indicating whether to replace repeated character sequences
            of length 3 or greater with sequences of length 3. Defaults to False.
        :type reduce_len: bool
        :param strip_handles: Flag indicating whether to remove Twitter handles of text used
            in the `tokenize` method. Defaults to False.
        :type strip_handles: bool
        :param match_phone_numbers: Flag indicating whether the `tokenize` method should look
            for phone numbers. Defaults to True.
        :type match_phone_numbers: bool
        """
        self.preserve_case = preserve_case
        self.reduce_len = reduce_len
        self.strip_handles = strip_handles
        self.match_phone_numbers = match_phone_numbers

    def tokenize(self, text: str) -> List[str]:
        """Tokenize the input text.

        :param text: str
        :rtype: list(str)
        :return: a tokenized list of strings; joining this list returns\
        the original string if `preserve_case=False`.
        """
        # Fix HTML character entities:
        text = _replace_html_entities(text)
        # Remove username handles
        if self.strip_handles:
            text = remove_handles(text)
        # Normalize word lengthening
        if self.reduce_len:
            text = reduce_lengthening(text)
        # Shorten problematic sequences of characters
        safe_text = HANG_RE.sub(r"\1\1\1", text)
        # Recognise phone numbers during tokenization
        if self.match_phone_numbers:
            words = self.PHONE_WORD_RE.findall(safe_text)
        else:
            words = self.WORD_RE.findall(safe_text)
        # Possibly alter the case, but avoid changing emoticons like :D into :d:
        if not self.preserve_case:
            words = list(
                map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words)
            )
        return words

    @property
    def WORD_RE(self) -> "regex.Pattern":
        """Core TweetTokenizer regex"""
        # Compiles the regex for this and all future instantiations of TweetTokenizer.
        if not type(self)._WORD_RE:
            type(self)._WORD_RE = regex.compile(
                f"({'|'.join(REGEXPS)})",
                regex.VERBOSE | regex.I | regex.UNICODE,
            )
        return type(self)._WORD_RE

    @property
    def PHONE_WORD_RE(self) -> "regex.Pattern":
        """Secondary core TweetTokenizer regex"""
        # Compiles the regex for this and all future instantiations of TweetTokenizer.
        if not type(self)._PHONE_WORD_RE:
            type(self)._PHONE_WORD_RE = regex.compile(
                f"({'|'.join(REGEXPS_PHONE)})",
                regex.VERBOSE | regex.I | regex.UNICODE,
            )
        return type(self)._PHONE_WORD_RE


######################################################################
# Normalization Functions
######################################################################


def reduce_lengthening(text):
    """
    Replace repeated character sequences of length 3 or greater with sequences
    of length 3.
    """
    pattern = regex.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1\1", text)


def remove_handles(text):
    """
    Remove Twitter username handles from text.
    """
    # Substitute handles with ' ' to ensure that text on either side of removed handles are tokenized correctly
    return HANDLES_RE.sub(" ", text)


######################################################################
# Tokenization Function
######################################################################


def casual_tokenize(
    text,
    preserve_case=True,
    reduce_len=False,
    strip_handles=False,
    match_phone_numbers=True,
):
    """
    Convenience function for wrapping the tokenizer.
    """
    return TweetTokenizer(
        preserve_case=preserve_case,
        reduce_len=reduce_len,
        strip_handles=strip_handles,
        match_phone_numbers=match_phone_numbers,
    ).tokenize(text)


###############################################################################
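Reviewer note: an illustrative sketch (not part of the committed file) exercising the four constructor flags documented in the module docstring above, on one tweet-like string.

from nltk.tokenize.casual import TweetTokenizer

tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
print(tknzr.tokenize("@someuser SOOOOO cooooool :-P!!! Call 555-123-4567"))
# The handle is stripped, everything except the emoticon :-P is lowercased,
# character runs longer than 3 are capped at 3 ("sooo", "coool"), and the
# phone number stays a single token because match_phone_numbers defaults
# to True (PHONE_REGEX sits second in REGEXPS_PHONE, ahead of the
# catch-all word patterns).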
backend/venv/Lib/site-packages/nltk/tokenize/destructive.py (new file, 234 lines)
@@ -0,0 +1,234 @@
# Natural Language Toolkit: NLTK's very own tokenizer.
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Liling Tan
#         Tom Aarsen <> (modifications)
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT


import re
import warnings
from typing import Iterator, List, Tuple

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import align_tokens


class MacIntyreContractions:
    """
    List of contractions adapted from Robert MacIntyre's tokenizer.
    """

    CONTRACTIONS2 = [
        r"(?i)\b(can)(?#X)(not)\b",
        r"(?i)\b(d)(?#X)('ye)\b",
        r"(?i)\b(gim)(?#X)(me)\b",
        r"(?i)\b(gon)(?#X)(na)\b",
        r"(?i)\b(got)(?#X)(ta)\b",
        r"(?i)\b(lem)(?#X)(me)\b",
        r"(?i)\b(more)(?#X)('n)\b",
        r"(?i)\b(wan)(?#X)(na)(?=\s)",
    ]
    CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
    CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]


class NLTKWordTokenizer(TokenizerI):
    """
    The NLTK tokenizer that has improved upon the TreebankWordTokenizer.

    This is the method that is invoked by ``word_tokenize()``. It assumes that the
    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

    The tokenizer is "destructive" such that the regexes applied will munge the
    input string to a state beyond re-construction. It is possible to apply
    `TreebankWordDetokenizer.detokenize` to the tokenized outputs of
    `NLTKDestructiveWordTokenizer.tokenize`, but there is no guarantee that this
    will revert to the original string.
    """

    # Starting quotes.
    STARTING_QUOTES = [
        (re.compile("([«“‘„]|[`]+)", re.U), r" \1 "),
        (re.compile(r"^\""), r"``"),
        (re.compile(r"(``)"), r" \1 "),
        (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
        (re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\b", re.U), r"\1 \2"),
    ]

    # Ending quotes.
    ENDING_QUOTES = [
        (re.compile("([»”’])", re.U), r" \1 "),
        (re.compile(r"''"), " '' "),
        (re.compile(r'"'), " '' "),
        (re.compile(r"\s+"), " "),
        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
    ]

    # For improvements for starting/closing quotes from TreebankWordTokenizer,
    # see discussion on https://github.com/nltk/nltk/pull/1437
    # Adding to TreebankWordTokenizer, nltk.word_tokenize now splits on
    # - chevron quotes u'\xab' and u'\xbb'
    # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
    # See https://github.com/nltk/nltk/issues/1995#issuecomment-376741608
    # Also, behavior of splitting on clitics now follows Stanford CoreNLP
    # - clitics covered (?!re|ve|ll|m|t|s|d)(\w)\b

    # Punctuation.
    PUNCTUATION = [
        (re.compile(r'([^\.])(\.)([\]\)}>"\'' "»”’ " r"]*)\s*$", re.U), r"\1 \2 \3 "),
        (re.compile(r"([:,])([^\d])"), r" \1 \2"),
        (re.compile(r"([:,])$"), r" \1 "),
        (
            re.compile(r"\.{2,}", re.U),
            r" \g<0> ",
        ),  # See https://github.com/nltk/nltk/pull/2322
        (re.compile(r"[;@#$%&]"), r" \g<0> "),
        (
            re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
            r"\1 \2\3 ",
        ),  # Handles the final period.
        (re.compile(r"[?!]"), r" \g<0> "),
        (re.compile(r"([^'])' "), r"\1 ' "),
        (
            re.compile(r"[*]", re.U),
            r" \g<0> ",
        ),  # See https://github.com/nltk/nltk/pull/2322
    ]

    # Pads parentheses
    PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")

    # Optionally: Convert parentheses and brackets to PTB symbols.
    CONVERT_PARENTHESES = [
        (re.compile(r"\("), "-LRB-"),
        (re.compile(r"\)"), "-RRB-"),
        (re.compile(r"\["), "-LSB-"),
        (re.compile(r"\]"), "-RSB-"),
        (re.compile(r"\{"), "-LCB-"),
        (re.compile(r"\}"), "-RCB-"),
    ]

    DOUBLE_DASHES = (re.compile(r"--"), r" -- ")

    # List of contractions adapted from Robert MacIntyre's tokenizer.
    _contractions = MacIntyreContractions()
    CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
    CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))

    def tokenize(
        self, text: str, convert_parentheses: bool = False, return_str: bool = False
    ) -> List[str]:
        r"""Return a tokenized copy of `text`.

            >>> from nltk.tokenize import NLTKWordTokenizer
            >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York.  Please buy me\ntwo of them.\nThanks.'''
            >>> NLTKWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
            ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
            'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
            'of', 'them.', 'Thanks', '.']
            >>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
            ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
            'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
            'of', 'them.', 'Thanks', '.']

        :param text: A string with a sentence or sentences.
        :type text: str
        :param convert_parentheses: if True, replace parentheses to PTB symbols,
            e.g. `(` to `-LRB-`. Defaults to False.
        :type convert_parentheses: bool, optional
        :param return_str: If True, return tokens as space-separated string,
            defaults to False.
        :type return_str: bool, optional
        :return: List of tokens from `text`.
        :rtype: List[str]
        """
        if return_str:
            warnings.warn(
                "Parameter 'return_str' has been deprecated and should no "
                "longer be used.",
                category=DeprecationWarning,
                stacklevel=2,
            )

        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Handles parentheses.
        regexp, substitution = self.PARENS_BRACKETS
        text = regexp.sub(substitution, text)
        # Optionally convert parentheses
        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Handles double dash.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        # add extra space to make things easier
        text = " " + text + " "

        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r" \1 \2 ", text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r" \1 \2 ", text)

        # We are not using CONTRACTIONS4 since
        # they are also commented out in the SED scripts
        # for regexp in self._contractions.CONTRACTIONS4:
        #     text = regexp.sub(r' \1 \2 \3 ', text)

        return text.split()

    def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
        r"""
        Returns the spans of the tokens in ``text``.
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

            >>> from nltk.tokenize import NLTKWordTokenizer
            >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
            True

        :param text: A string with a sentence or sentences.
        :type text: str
        :yield: Tuple[int, int]
        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to original double quotes
        # Do this only if original text contains double quote(s) or double
        # single-quotes (because '' might be transformed to `` if it is
        # treated as starting quotes).
        if ('"' in text) or ("''" in text):
            # Find double quotes and converted quotes
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Replace converted quotes back to double quotes
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        yield from align_tokens(tokens, text)
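Reviewer note: a sketch (not part of the diff) of the span-recovery path above. Even though ``tokenize()`` is destructive, ``span_tokenize()`` re-aligns the token list against the raw text with ``align_tokens`` so that each span slices the original string.

from nltk.tokenize.destructive import NLTKWordTokenizer

s = "Good muffins cost $3.88."
spans = list(NLTKWordTokenizer().span_tokenize(s))
# Each (start, end) pair indexes into the *original* string:
print([(start, end, s[start:end]) for start, end in spans])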
backend/venv/Lib/site-packages/nltk/tokenize/legality_principle.py (new file, 147 lines)
@@ -0,0 +1,147 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Christopher Hench <chris.l.hench@gmail.com>
#         Alex Estes
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

"""
The Legality Principle is a language agnostic principle maintaining that syllable
onsets and codas (the beginning and ends of syllables not including the vowel)
are only legal if they are found as word onsets or codas in the language. The English
word ''admit'' must then be syllabified as ''ad-mit'' since ''dm'' is not found
word-initially in the English language (Bartlett et al.). This principle was first proposed
in Daniel Kahn's 1976 dissertation, ''Syllable-based generalizations in English phonology''.

Kahn further argues that there is a ''strong tendency to syllabify in such a way that
initial clusters are of maximal length, consistent with the general constraints on
word-initial consonant clusters.'' Consequently, in addition to being legal onsets,
the longest legal onset is preferable---''Onset Maximization''.

The default implementation assumes an English vowel set, but the `vowels` attribute
can be set to IPA or any other alphabet's vowel set for the use-case.
Both a valid set of vowels as well as a text corpus of words in the language
are necessary to determine legal onsets and subsequently syllabify words.

The legality principle with onset maximization is a universal syllabification algorithm,
but that does not mean it performs equally across languages. Bartlett et al. (2009)
is a good benchmark for English accuracy if utilizing IPA (pg. 311).

References:

- Otto Jespersen. 1904. Lehrbuch der Phonetik.
  Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203.
- Theo Vennemann, ''On the Theory of Syllabic Phonology,'' 1972, p. 11.
- Daniel Kahn, ''Syllable-based generalizations in English phonology'', (PhD diss., MIT, 1976).
- Elisabeth Selkirk. 1984. On the major class features and syllable theory.
  In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology.
  Cambridge, MIT Press. pp. 107-136.
- Jeremy Goslin and Ulrich Frauenfelder. 2001. A comparison of theoretical and human syllabification. Language and Speech, 44:409-436.
- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes.
  In HLT-NAACL. pp. 308-316.
- Christopher Hench. 2017. Resonances in Middle High German: New Methodologies in Prosody. UC Berkeley.
"""

from collections import Counter

from nltk.tokenize.api import TokenizerI


class LegalitySyllableTokenizer(TokenizerI):
    """
    Syllabifies words based on the Legality Principle and Onset Maximization.

        >>> from nltk.tokenize import LegalitySyllableTokenizer
        >>> from nltk import word_tokenize
        >>> from nltk.corpus import words
        >>> text = "This is a wonderful sentence."
        >>> text_words = word_tokenize(text)
        >>> LP = LegalitySyllableTokenizer(words.words())
        >>> [LP.tokenize(word) for word in text_words]
        [['This'], ['is'], ['a'], ['won', 'der', 'ful'], ['sen', 'ten', 'ce'], ['.']]
    """

    def __init__(
        self, tokenized_source_text, vowels="aeiouy", legal_frequency_threshold=0.001
    ):
        """
        :param tokenized_source_text: List of valid tokens in the language
        :type tokenized_source_text: list(str)
        :param vowels: Valid vowels in language or IPA representation
        :type vowels: str
        :param legal_frequency_threshold: Lowest frequency of all onsets to be considered a legal onset
        :type legal_frequency_threshold: float
        """
        self.legal_frequency_threshold = legal_frequency_threshold
        self.vowels = vowels
        self.legal_onsets = self.find_legal_onsets(tokenized_source_text)

    def find_legal_onsets(self, words):
        """
        Gathers all onsets and then returns only those above the frequency threshold.

        :param words: List of words in a language
        :type words: list(str)
        :return: Set of legal onsets
        :rtype: set(str)
        """
        onsets = [self.onset(word) for word in words]
        legal_onsets = [
            k
            for k, v in Counter(onsets).items()
            if (v / len(onsets)) > self.legal_frequency_threshold
        ]
        return set(legal_onsets)

    def onset(self, word):
        """
        Returns the consonant cluster of a word, i.e. all characters until the first vowel.

        :param word: Single word or token
        :type word: str
        :return: String of characters of onset
        :rtype: str
        """
        onset = ""
        for c in word.lower():
            if c in self.vowels:
                return onset
            else:
                onset += c
        return onset

    def tokenize(self, token):
        """
        Apply the Legality Principle in combination with
        Onset Maximization to return a list of syllables.

        :param token: Single word or token
        :type token: str
        :return syllable_list: Single word or token broken up into syllables.
        :rtype: list(str)
        """
        syllables = []
        syllable, current_onset = "", ""
        vowel, onset = False, False
        for char in token[::-1]:
            char_lower = char.lower()
            if not vowel:
                syllable += char
                vowel = bool(char_lower in self.vowels)
            else:
                if char_lower + current_onset[::-1] in self.legal_onsets:
                    syllable += char
                    current_onset += char_lower
                    onset = True
                elif char_lower in self.vowels and not onset:
                    syllable += char
                    current_onset += char_lower
                else:
                    syllables.append(syllable)
                    syllable = char
                    current_onset = ""
                    vowel = bool(char_lower in self.vowels)
        syllables.append(syllable)
        syllables_ordered = [syllable[::-1] for syllable in syllables][::-1]
        return syllables_ordered
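Reviewer note: a sketch (not part of the committed file) of the two inputs the class above needs: a word list to learn legal onsets from, and a vowel inventory (the `vowels` parameter accepts any alphabet, e.g. IPA). It assumes the NLTK `words` corpus is installed (nltk.download('words')).

from nltk.corpus import words
from nltk.tokenize import LegalitySyllableTokenizer

# Legal onsets are estimated from onset frequencies in the word list.
lp = LegalitySyllableTokenizer(words.words())
print(lp.tokenize("wonderful"))
# ['won', 'der', 'ful']  (per the class docstring above)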
backend/venv/Lib/site-packages/nltk/tokenize/mwe.py (new file, 124 lines)
@@ -0,0 +1,124 @@
# Multi-Word Expression tokenizer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Rob Malouf <rmalouf@mail.sdsu.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Multi-Word Expression Tokenizer

A ``MWETokenizer`` takes a string which has already been divided into tokens and
retokenizes it, merging multi-word expressions into single tokens, using a lexicon
of MWEs:


    >>> from nltk.tokenize import MWETokenizer

    >>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')])
    >>> tokenizer.add_mwe(('in', 'spite', 'of'))

    >>> tokenizer.tokenize('Testing testing testing one two three'.split())
    ['Testing', 'testing', 'testing', 'one', 'two', 'three']

    >>> tokenizer.tokenize('This is a test in spite'.split())
    ['This', 'is', 'a', 'test', 'in', 'spite']

    >>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split())
    ['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']

"""
from nltk.tokenize.api import TokenizerI
from nltk.util import Trie


class MWETokenizer(TokenizerI):
    """A tokenizer that processes tokenized text and merges multi-word expressions
    into single tokens.
    """

    def __init__(self, mwes=None, separator="_"):
        """Initialize the multi-word tokenizer with a list of expressions and a
        separator.

        :type mwes: list(list(str))
        :param mwes: A sequence of multi-word expressions to be merged, where
            each MWE is a sequence of strings.
        :type separator: str
        :param separator: String that should be inserted between words in a multi-word
            expression token. (Default is '_')

        """
        if not mwes:
            mwes = []
        self._mwes = Trie(mwes)
        self._separator = separator

    def add_mwe(self, mwe):
        """Add a multi-word expression to the lexicon (stored as a word trie)

        We use ``util.Trie`` to represent the trie. Its form is a dict of dicts.
        The key True marks the end of a valid MWE.

        :param mwe: The multi-word expression we're adding into the word trie
        :type mwe: tuple(str) or list(str)

        :Example:

            >>> tokenizer = MWETokenizer()
            >>> tokenizer.add_mwe(('a', 'b'))
            >>> tokenizer.add_mwe(('a', 'b', 'c'))
            >>> tokenizer.add_mwe(('a', 'x'))
            >>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
            >>> tokenizer._mwes == expected
            True

        """
        self._mwes.insert(mwe)

    def tokenize(self, text):
        """

        :param text: A list containing tokenized text
        :type text: list(str)
        :return: A list of the tokenized text with multi-words merged together
        :rtype: list(str)

        :Example:

            >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
            >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split())
            ['An', "hors+d'oeuvre", 'tonight,', 'sir?']

        """
        i = 0
        n = len(text)
        result = []

        while i < n:
            if text[i] in self._mwes:
                # possible MWE match
                j = i
                trie = self._mwes
                last_match = -1
                while j < n and text[j] in trie:  # and len(trie[text[j]]) > 0 :
                    trie = trie[text[j]]
                    j = j + 1
                    if Trie.LEAF in trie:
                        last_match = j
                else:
                    if last_match > -1:
                        j = last_match

                if Trie.LEAF in trie or last_match > -1:
                    # success!
                    result.append(self._separator.join(text[i:j]))
                    i = j
                else:
                    # no match, so backtrack
                    result.append(text[i])
                    i += 1
            else:
                result.append(text[i])
                i += 1
        return result
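Reviewer note: a sketch (not part of the diff) of the greedy-with-fallback trie walk in ``tokenize()`` above. The walk extends the match as far as it can, but remembers the last position where a complete MWE ended (``last_match``), so a longer partial match cannot swallow tokens.

from nltk.tokenize import MWETokenizer

tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit')])
print(tokenizer.tokenize('a little bird'.split()))
# ['a_little', 'bird'] -- the walk tried to extend toward 'a little bit',
# 'bit' never arrived, so it fell back to the last complete expression.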
backend/venv/Lib/site-packages/nltk/tokenize/nist.py (new file, 179 lines)
@@ -0,0 +1,179 @@
# Natural Language Toolkit: Python port of the mteval-v14.pl tokenizer.
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Liling Tan (ported from ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-v14.pl)
# Contributors: Ozan Caglayan, Wiktor Stribizew
#
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

"""
This is an NLTK port of the tokenizer used in the NIST BLEU evaluation script,
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
which was also ported into Python in
https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
"""


import io
import re

from nltk.corpus import perluniprops
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import xml_unescape


class NISTTokenizer(TokenizerI):
    """
    This NIST tokenizer is sentence-based instead of the original
    paragraph-based tokenization from mteval-14.pl; the sentence-based
    tokenization is consistent with the other tokenizers available in NLTK.

        >>> from nltk.tokenize.nist import NISTTokenizer
        >>> nist = NISTTokenizer()
        >>> s = "Good muffins cost $3.88 in New York."
        >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
        >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
        >>> nist.tokenize(s, lowercase=False) == expected_cased
        True
        >>> nist.tokenize(s, lowercase=True) == expected_lower  # Lowercased.
        True

    The international_tokenize() is the preferred function when tokenizing
    non-European text, e.g.

        >>> from nltk.tokenize.nist import NISTTokenizer
        >>> nist = NISTTokenizer()

        # Input strings.
        >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) is a Chinese e-commerce company...'
        >>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
        >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'

        # Expected tokens.
        >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'\u963f\u91cc\u5df4\u5df4\u96c6\u56e2\u63a7\u80a1', u'\u6709\u9650\u516c\u53f8', u')']
        >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'\u02c8\xe6', u'm']
        >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'\u697d\u5929\u682a\u5f0f\u4f1a\u793e', u'Rakuten', u'Kabushiki', u'-', u'gaisha']

        >>> nist.international_tokenize(albb)[:10] == expected_albb
        True
        >>> nist.international_tokenize(amz)[:10] == expected_amz
        True
        >>> nist.international_tokenize(rkt)[:10] == expected_rkt
        True

        # Doctest for patching issue #1926
        >>> sent = u'this is a foo\u2604sentence.'
        >>> expected_sent = [u'this', u'is', u'a', u'foo', u'\u2604', u'sentence', u'.']
        >>> nist.international_tokenize(sent) == expected_sent
        True
    """

    # Strip "skipped" tags
    STRIP_SKIP = re.compile("<skipped>"), ""
    # Strip end-of-line hyphenation and join lines
    STRIP_EOL_HYPHEN = re.compile("\u2028"), " "
    # Tokenize punctuation.
    PUNCT = re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), " \\1 "
    # Tokenize period and comma unless preceded by a digit.
    PERIOD_COMMA_PRECEED = re.compile(r"([^0-9])([\.,])"), "\\1 \\2 "
    # Tokenize period and comma unless followed by a digit.
    PERIOD_COMMA_FOLLOW = re.compile(r"([\.,])([^0-9])"), " \\1 \\2"
    # Tokenize dash when preceded by a digit
    DASH_PRECEED_DIGIT = re.compile("([0-9])(-)"), "\\1 \\2 "

    LANG_DEPENDENT_REGEXES = [
        PUNCT,
        PERIOD_COMMA_PRECEED,
        PERIOD_COMMA_FOLLOW,
        DASH_PRECEED_DIGIT,
    ]

    # Perluniprops characters used in NIST tokenizer.
    pup_number = str("".join(set(perluniprops.chars("Number"))))  # i.e. \p{N}
    pup_punct = str("".join(set(perluniprops.chars("Punctuation"))))  # i.e. \p{P}
    pup_symbol = str("".join(set(perluniprops.chars("Symbol"))))  # i.e. \p{S}

    # Python regexes need to escape some special symbols,
    # see https://stackoverflow.com/q/45670950/610569
    number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number)
    punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct)
    symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol)

    # Note: In the original perl implementation, \p{Z} and \p{Zl} were used to
    #       (i) strip trailing and heading spaces and
    #       (ii) de-duplicate spaces.
    #       In Python, this would do: ' '.join(str.strip().split())
    # Thus, the next two lines were commented out.
    # Line_Separator = str(''.join(perluniprops.chars('Line_Separator')))  # i.e. \p{Zl}
    # Separator = str(''.join(perluniprops.chars('Separator')))  # i.e. \p{Z}

    # Pads non-ascii strings with space.
    NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 "
    # Tokenize any punctuation unless followed AND preceded by a digit.
    PUNCT_1 = (
        re.compile(f"([{number_regex}])([{punct_regex}])"),
        "\\1 \\2 ",
    )
    PUNCT_2 = (
        re.compile(f"([{punct_regex}])([{number_regex}])"),
        " \\1 \\2",
    )
    # Tokenize symbols
    SYMBOLS = re.compile(f"([{symbol_regex}])"), " \\1 "

    INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]

    def lang_independent_sub(self, text):
        """Performs the language independent string substitutions."""
        # It's a strange order of regexes.
        # It'll be better to unescape after STRIP_EOL_HYPHEN
        # but let's keep it close to the original NIST implementation.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        return text

    def tokenize(self, text, lowercase=False, western_lang=True, return_str=False):
        text = str(text)
        # Language independent regex.
        text = self.lang_independent_sub(text)
        # Language dependent regex.
        if western_lang:
            # Pad string with whitespace.
            text = " " + text + " "
            if lowercase:
                text = text.lower()
            for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
                text = regexp.sub(substitution, text)
        # Remove contiguous whitespaces.
        text = " ".join(text.split())
        # Finally, strips heading and trailing spaces
        # and converts output string into unicode.
        text = str(text.strip())
        return text if return_str else text.split()

    def international_tokenize(
        self, text, lowercase=False, split_non_ascii=True, return_str=False
    ):
        text = str(text)
        # Different from the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied
        # first before unescaping.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)

        if lowercase:
            text = text.lower()

        for regexp, substitution in self.INTERNATIONAL_REGEXES:
            text = regexp.sub(substitution, text)

        # Make sure that there's only one space only between words.
        # Strip leading and trailing spaces.
        text = " ".join(text.strip().split())
        return text if return_str else text.split()
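Reviewer note: a sketch (not part of the committed file) of the western-language path above. Note that the class body reads the `perluniprops` corpus at import time, so nltk.download('perluniprops') is assumed.

from nltk.tokenize.nist import NISTTokenizer

nist = NISTTokenizer()
# '$' is split off by PUNCT; the period in '3.88' survives because both
# PERIOD_COMMA rules require a non-digit on one side; the final '.' is split.
print(nist.tokenize("Good muffins cost $3.88.", lowercase=True))
# ['good', 'muffins', 'cost', '$', '3.88', '.']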
backend/venv/Lib/site-packages/nltk/tokenize/punkt.py (new file, 1826 lines)
File diff suppressed because it is too large.
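Reviewer note: the punkt.py diff is collapsed above, but it ships ``PunktTokenizer``, which is what the cached ``sent_tokenize()`` in __init__.py loads. A minimal sketch, assuming the `punkt_tab` models are installed (nltk.download('punkt_tab')):

from nltk.tokenize.punkt import PunktTokenizer

pst = PunktTokenizer("english")
print(pst.tokenize("Good muffins cost $3.88 in New York. Please buy me two."))
# ['Good muffins cost $3.88 in New York.', 'Please buy me two.']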
backend/venv/Lib/site-packages/nltk/tokenize/regexp.py (new file, 220 lines)
@@ -0,0 +1,220 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
#         Trevor Cohn <tacohn@csse.unimelb.edu.au>
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

r"""
Regular-Expression Tokenizers

A ``RegexpTokenizer`` splits a string into substrings using a regular expression.
For example, the following tokenizer forms tokens out of alphabetic sequences,
money expressions, and any other non-whitespace sequences:

    >>> from nltk.tokenize import RegexpTokenizer
    >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
    >>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    >>> tokenizer.tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

A ``RegexpTokenizer`` can use its regexp to match delimiters instead:

    >>> tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    >>> tokenizer.tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']

Note that empty tokens are not returned when the delimiter appears at
the start or end of the string.

The material between the tokens is discarded.  For example,
the following tokenizer selects just the capitalized words:

    >>> capword_tokenizer = RegexpTokenizer(r'[A-Z]\w+')
    >>> capword_tokenizer.tokenize(s)
    ['Good', 'New', 'York', 'Please', 'Thanks']

This module contains several subclasses of ``RegexpTokenizer``
that use pre-defined regular expressions.

    >>> from nltk.tokenize import BlanklineTokenizer
    >>> # Uses '\s*\n\s*\n\s*':
    >>> BlanklineTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.',
    'Thanks.']

All of the regular expression tokenizers are also available as functions:

    >>> from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize
    >>> regexp_tokenize(s, pattern=r'\w+|\$[\d\.]+|\S+') # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
    '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    >>> blankline_tokenize(s)
    ['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.', 'Thanks.']

Caution: The function ``regexp_tokenize()`` takes the text as its
first argument, and the regular expression pattern as its second
argument.  This differs from the conventions used by Python's
``re`` functions, where the pattern is always the first argument.
(This is for consistency with the other NLTK tokenizers.)
"""

import re

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import regexp_span_tokenize


class RegexpTokenizer(TokenizerI):
    r"""
    A tokenizer that splits a string using a regular expression, which
    matches either the tokens or the separators between tokens.

        >>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

    :type pattern: str
    :param pattern: The pattern used to build this tokenizer.
        (This pattern must not contain capturing parentheses;
        use non-capturing parentheses, e.g. (?:...), instead)
    :type gaps: bool
    :param gaps: True if this tokenizer's pattern should be used
        to find separators between tokens; False if this
        tokenizer's pattern should be used to find the tokens
        themselves.
    :type discard_empty: bool
    :param discard_empty: True if any empty tokens `''`
        generated by the tokenizer should be discarded.  Empty
        tokens can only be generated if `_gaps == True`.
    :type flags: int
    :param flags: The regexp flags used to compile this
        tokenizer's pattern.  By default, the following flags are
        used: `re.UNICODE | re.MULTILINE | re.DOTALL`.
    """

    def __init__(
        self,
        pattern,
        gaps=False,
        discard_empty=True,
        flags=re.UNICODE | re.MULTILINE | re.DOTALL,
    ):
        # If they gave us a regexp object, extract the pattern.
        pattern = getattr(pattern, "pattern", pattern)

        self._pattern = pattern
        self._gaps = gaps
        self._discard_empty = discard_empty
        self._flags = flags
        self._regexp = None

    def _check_regexp(self):
        if self._regexp is None:
            self._regexp = re.compile(self._pattern, self._flags)

    def tokenize(self, text):
        self._check_regexp()
        # If our regexp matches gaps, use re.split:
        if self._gaps:
            if self._discard_empty:
                return [tok for tok in self._regexp.split(text) if tok]
            else:
                return self._regexp.split(text)

        # If our regexp matches tokens, use re.findall:
        else:
            return self._regexp.findall(text)

    def span_tokenize(self, text):
        self._check_regexp()

        if self._gaps:
            for left, right in regexp_span_tokenize(text, self._regexp):
                if not (self._discard_empty and left == right):
                    yield left, right
        else:
            for m in re.finditer(self._regexp, text):
                yield m.span()

    def __repr__(self):
        return "{}(pattern={!r}, gaps={!r}, discard_empty={!r}, flags={!r})".format(
            self.__class__.__name__,
            self._pattern,
            self._gaps,
            self._discard_empty,
            self._flags,
        )


class WhitespaceTokenizer(RegexpTokenizer):
    r"""
    Tokenize a string on whitespace (space, tab, newline).
    In general, users should use the string ``split()`` method instead.

        >>> from nltk.tokenize import WhitespaceTokenizer
        >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
        >>> WhitespaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
        'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
    """

    def __init__(self):
        RegexpTokenizer.__init__(self, r"\s+", gaps=True)


class BlanklineTokenizer(RegexpTokenizer):
    """
    Tokenize a string, treating any sequence of blank lines as a delimiter.
    Blank lines are defined as lines containing no characters, except for
    space or tab characters.
    """

    def __init__(self):
        RegexpTokenizer.__init__(self, r"\s*\n\s*\n\s*", gaps=True)


class WordPunctTokenizer(RegexpTokenizer):
    r"""
    Tokenize a text into a sequence of alphabetic and
    non-alphabetic characters, using the regexp ``\w+|[^\w\s]+``.

        >>> from nltk.tokenize import WordPunctTokenizer
        >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
        >>> WordPunctTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
        '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    """

    def __init__(self):
        RegexpTokenizer.__init__(self, r"\w+|[^\w\s]+")


######################################################################
# { Tokenization Functions
######################################################################


def regexp_tokenize(
    text,
    pattern,
    gaps=False,
    discard_empty=True,
    flags=re.UNICODE | re.MULTILINE | re.DOTALL,
):
    """
    Return a tokenized copy of *text*.  See :class:`.RegexpTokenizer`
    for descriptions of the arguments.
    """
    tokenizer = RegexpTokenizer(pattern, gaps, discard_empty, flags)
    return tokenizer.tokenize(text)


blankline_tokenize = BlanklineTokenizer().tokenize
wordpunct_tokenize = WordPunctTokenizer().tokenize
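
``span_tokenize()`` is useful when character offsets into the original string must be preserved. A minimal sketch (it only exercises the class defined above; nothing here is new API):

    from nltk.tokenize import RegexpTokenizer

    s = "Good muffins cost $3.88"
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    for start, end in tokenizer.span_tokenize(s):
        # Each span slices the original string back out as a token.
        print((start, end), s[start:end])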
149
backend/venv/Lib/site-packages/nltk/tokenize/repp.py
Normal file
@@ -0,0 +1,149 @@
# Natural Language Toolkit: Interface to the Repp Tokenizer
#
# Copyright (C) 2001-2015 NLTK Project
# Authors: Rebecca Dridan and Stephan Oepen
# Contributors: Liling Tan
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import os
import re
import subprocess
import sys
import tempfile

from nltk.data import ZipFilePathPointer
from nltk.internals import find_dir
from nltk.tokenize.api import TokenizerI


class ReppTokenizer(TokenizerI):
    """
    A class for word tokenization using the REPP parser described in
    Rebecca Dridan and Stephan Oepen (2012) Tokenization: Returning to a
    Long Solved Problem - A Survey, Contrastive Experiment, Recommendations,
    and Toolkit. In ACL. http://anthology.aclweb.org/P/P12/P12-2.pdf#page=406

        >>> sents = ['Tokenization is widely regarded as a solved problem due to the high accuracy that rulebased tokenizers achieve.',
        ... 'But rule-based tokenizers are hard to maintain and their rules language specific.',
        ... 'We evaluated our method on three languages and obtained error rates of 0.27% (English), 0.35% (Dutch) and 0.76% (Italian) for our best models.'
        ... ]
        >>> tokenizer = ReppTokenizer('/home/alvas/repp/')  # doctest: +SKIP
        >>> for sent in sents:  # doctest: +SKIP
        ...     tokenizer.tokenize(sent)  # doctest: +SKIP
        ...
        (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
        (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
        (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')

        >>> for sent in tokenizer.tokenize_sents(sents):  # doctest: +SKIP
        ...     print(sent)  # doctest: +SKIP
        ...
        (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
        (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
        (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
        >>> for sent in tokenizer.tokenize_sents(sents, keep_token_positions=True):  # doctest: +SKIP
        ...     print(sent)  # doctest: +SKIP
        ...
        [(u'Tokenization', 0, 12), (u'is', 13, 15), (u'widely', 16, 22), (u'regarded', 23, 31), (u'as', 32, 34), (u'a', 35, 36), (u'solved', 37, 43), (u'problem', 44, 51), (u'due', 52, 55), (u'to', 56, 58), (u'the', 59, 62), (u'high', 63, 67), (u'accuracy', 68, 76), (u'that', 77, 81), (u'rulebased', 82, 91), (u'tokenizers', 92, 102), (u'achieve', 103, 110), (u'.', 110, 111)]
        [(u'But', 0, 3), (u'rule-based', 4, 14), (u'tokenizers', 15, 25), (u'are', 26, 29), (u'hard', 30, 34), (u'to', 35, 37), (u'maintain', 38, 46), (u'and', 47, 50), (u'their', 51, 56), (u'rules', 57, 62), (u'language', 63, 71), (u'specific', 72, 80), (u'.', 80, 81)]
        [(u'We', 0, 2), (u'evaluated', 3, 12), (u'our', 13, 16), (u'method', 17, 23), (u'on', 24, 26), (u'three', 27, 32), (u'languages', 33, 42), (u'and', 43, 46), (u'obtained', 47, 55), (u'error', 56, 61), (u'rates', 62, 67), (u'of', 68, 70), (u'0.27', 71, 75), (u'%', 75, 76), (u'(', 77, 78), (u'English', 78, 85), (u')', 85, 86), (u',', 86, 87), (u'0.35', 88, 92), (u'%', 92, 93), (u'(', 94, 95), (u'Dutch', 95, 100), (u')', 100, 101), (u'and', 102, 105), (u'0.76', 106, 110), (u'%', 110, 111), (u'(', 112, 113), (u'Italian', 113, 120), (u')', 120, 121), (u'for', 122, 125), (u'our', 126, 129), (u'best', 130, 134), (u'models', 135, 141), (u'.', 141, 142)]
    """

    def __init__(self, repp_dir, encoding="utf8"):
        self.repp_dir = self.find_repptokenizer(repp_dir)
        # Set a directory to store the temporary files.
        self.working_dir = tempfile.gettempdir()
        # Set an encoding for the input strings.
        self.encoding = encoding

    def tokenize(self, sentence):
        """
        Use Repp to tokenize a single sentence.

        :param sentence: A single sentence string.
        :type sentence: str
        :return: A tuple of tokens.
        :rtype: tuple(str)
        """
        return next(self.tokenize_sents([sentence]))

    def tokenize_sents(self, sentences, keep_token_positions=False):
        """
        Tokenize multiple sentences using Repp.

        :param sentences: A list of sentence strings.
        :type sentences: list(str)
        :return: A list of tuples of tokens
        :rtype: iter(tuple(str))
        """
        with tempfile.NamedTemporaryFile(
            prefix="repp_input.", dir=self.working_dir, mode="w", delete=False
        ) as input_file:
            # Write sentences to the temporary input file.
            for sent in sentences:
                input_file.write(str(sent) + "\n")
            input_file.close()
            # Generate the command to run REPP.
            cmd = self.generate_repp_command(input_file.name)
            # Decode the stdout and strip the ending newline.
            repp_output = self._execute(cmd).decode(self.encoding).strip()
            for tokenized_sent in self.parse_repp_outputs(repp_output):
                if not keep_token_positions:
                    # Remove token position information.
                    tokenized_sent, starts, ends = zip(*tokenized_sent)
                yield tokenized_sent

    def generate_repp_command(self, inputfilename):
        """
        Generate the REPP command to be run at the terminal.

        :param inputfilename: path to the input file
        :type inputfilename: str
        """
        cmd = [self.repp_dir + "/src/repp"]
        cmd += ["-c", self.repp_dir + "/erg/repp.set"]
        cmd += ["--format", "triple"]
        cmd += [inputfilename]
        return cmd

    @staticmethod
    def _execute(cmd):
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        return stdout

    @staticmethod
    def parse_repp_outputs(repp_output):
        """
        Parse the tri-tuple format that REPP outputs when using the
        "--format triple" option, and return a generator of tuples of
        string tokens.

        :param repp_output:
        :type repp_output: str
        :return: an iterable of the tokenized sentences as tuples of strings
        :rtype: iter(tuple)
        """
        line_regex = re.compile(r"^\((\d+), (\d+), (.+)\)$", re.MULTILINE)
        for section in repp_output.split("\n\n"):
            words_with_positions = [
                (token, int(start), int(end))
                for start, end, token in line_regex.findall(section)
            ]
            # The token is the first element of each (token, start, end) tuple.
            words = tuple(t[0] for t in words_with_positions)
            yield words_with_positions

    def find_repptokenizer(self, repp_dirname):
        """
        Find the REPP tokenizer binary and its *repp.set* config file.
        """
        if os.path.exists(repp_dirname):  # If a full path is given.
            _repp_dir = repp_dirname
        else:  # Try to find the path to the REPP directory in environment variables.
            _repp_dir = find_dir(repp_dirname, env_vars=("REPP_TOKENIZER",))
        # Check for the REPP binary and the erg/repp.set config file.
        assert os.path.exists(_repp_dir + "/src/repp")
        assert os.path.exists(_repp_dir + "/erg/repp.set")
        return _repp_dir
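
Because ``parse_repp_outputs`` is a static method, the triple format can be parsed without the REPP binary installed. A minimal sketch with a hand-written sample string in the documented ``(start, end, token)`` triple format (the sample is invented for illustration):

    from nltk.tokenize.repp import ReppTokenizer

    sample = "(0, 4, Good)\n(5, 12, muffins)"
    for sent in ReppTokenizer.parse_repp_outputs(sample):
        print(sent)  # [('Good', 0, 4), ('muffins', 5, 12)]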
140
backend/venv/Lib/site-packages/nltk/tokenize/sexpr.py
Normal file
@@ -0,0 +1,140 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Yoav Goldberg <yoavg@cs.bgu.ac.il>
#         Steven Bird <stevenbird1@gmail.com> (minor edits)
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

"""
S-Expression Tokenizer

``SExprTokenizer`` is used to find parenthesized expressions in a
string.  In particular, it divides a string into a sequence of
substrings that are either parenthesized expressions (including any
nested parenthesized expressions), or other whitespace-separated
tokens.

    >>> from nltk.tokenize import SExprTokenizer
    >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
    ['(a b (c d))', 'e', 'f', '(g)']

By default, `SExprTokenizer` will raise a ``ValueError`` exception if
used to tokenize an expression with non-matching parentheses:

    >>> SExprTokenizer().tokenize('c) d) e (f (g')
    Traceback (most recent call last):
    ...
    ValueError: Un-matched close paren at char 1

The ``strict`` argument can be set to False to allow for
non-matching parentheses.  Any unmatched close parentheses will be
listed as their own s-expression; and the last partial sexpr with
unmatched open parentheses will be listed as its own sexpr:

    >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
    ['c', ')', 'd', ')', 'e', '(f (g']

The characters used for open and close parentheses may be customized
using the ``parens`` argument to the `SExprTokenizer` constructor:

    >>> SExprTokenizer(parens='{}').tokenize('{a b {c d}} e f {g}')
    ['{a b {c d}}', 'e', 'f', '{g}']

The s-expression tokenizer is also available as a function:

    >>> from nltk.tokenize import sexpr_tokenize
    >>> sexpr_tokenize('(a b (c d)) e f (g)')
    ['(a b (c d))', 'e', 'f', '(g)']
"""

import re

from nltk.tokenize.api import TokenizerI


class SExprTokenizer(TokenizerI):
    """
    A tokenizer that divides strings into s-expressions.
    An s-expression can be either:

      - a parenthesized expression, including any nested parenthesized
        expressions, or
      - a sequence of non-whitespace non-parenthesis characters.

    For example, the string ``(a (b c)) d e (f)`` consists of four
    s-expressions: ``(a (b c))``, ``d``, ``e``, and ``(f)``.

    By default, the characters ``(`` and ``)`` are treated as open and
    close parentheses, but alternative strings may be specified.

    :param parens: A two-element sequence specifying the open and close parentheses
        that should be used to find sexprs.  This will typically be either a
        two-character string, or a list of two strings.
    :type parens: str or list
    :param strict: If true, then raise an exception when tokenizing an ill-formed sexpr.
    """

    def __init__(self, parens="()", strict=True):
        if len(parens) != 2:
            raise ValueError("parens must contain exactly two strings")
        self._strict = strict
        self._open_paren = parens[0]
        self._close_paren = parens[1]
        self._paren_regexp = re.compile(
            f"{re.escape(parens[0])}|{re.escape(parens[1])}"
        )

    def tokenize(self, text):
        """
        Return a list of s-expressions extracted from *text*.
        For example:

            >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
            ['(a b (c d))', 'e', 'f', '(g)']

        All parentheses are assumed to mark s-expressions.
        (No special processing is done to exclude parentheses that occur
        inside strings, or following backslash characters.)

        If the given expression contains non-matching parentheses,
        then the behavior of the tokenizer depends on the ``strict``
        parameter to the constructor.  If ``strict`` is ``True``, then
        raise a ``ValueError``.  If ``strict`` is ``False``, then any
        unmatched close parentheses will be listed as their own
        s-expression; and the last partial s-expression with unmatched open
        parentheses will be listed as its own s-expression:

            >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
            ['c', ')', 'd', ')', 'e', '(f (g']

        :param text: the string to be tokenized
        :type text: str or iter(str)
        :rtype: iter(str)
        """
        result = []
        pos = 0
        depth = 0
        for m in self._paren_regexp.finditer(text):
            paren = m.group()
            if depth == 0:
                result += text[pos : m.start()].split()
                pos = m.start()
            if paren == self._open_paren:
                depth += 1
            if paren == self._close_paren:
                if self._strict and depth == 0:
                    raise ValueError("Un-matched close paren at char %d" % m.start())
                depth = max(0, depth - 1)
                if depth == 0:
                    result.append(text[pos : m.end()])
                    pos = m.end()
        if self._strict and depth > 0:
            raise ValueError("Un-matched open paren at char %d" % pos)
        if pos < len(text):
            result.append(text[pos:])
        return result


sexpr_tokenize = SExprTokenizer().tokenize
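
Since nested parentheses are kept intact inside each s-expression, the tokenizer can be applied recursively to walk a nested structure. A small sketch (the `walk` helper is illustrative, not part of NLTK):

    from nltk.tokenize import sexpr_tokenize

    def walk(sexpr, depth=0):
        # Recurse into parenthesized tokens by stripping the outer parens.
        for tok in sexpr_tokenize(sexpr):
            print("  " * depth + tok)
            if tok.startswith("(") and tok.endswith(")"):
                walk(tok[1:-1], depth + 1)

    walk("(a b (c d)) e f (g)")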
139
backend/venv/Lib/site-packages/nltk/tokenize/simple.py
Normal file
@@ -0,0 +1,139 @@
# Natural Language Toolkit: Simple Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

r"""
Simple Tokenizers

These tokenizers divide strings into substrings using the string
``split()`` method.
When tokenizing using a particular delimiter string, use
the string ``split()`` method directly, as this is more efficient.

The simple tokenizers are *not* available as separate functions;
instead, you should just use the string ``split()`` method directly:

    >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
    >>> s.split() # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
    >>> s.split(' ') # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
    'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    >>> s.split('\n') # doctest: +NORMALIZE_WHITESPACE
    ['Good muffins cost $3.88', 'in New York. Please buy me',
    'two of them.', '', 'Thanks.']

The simple tokenizers are mainly useful because they follow the
standard ``TokenizerI`` interface, and so can be used with any code
that expects a tokenizer.  For example, these tokenizers can be used
to specify the tokenization conventions when building a `CorpusReader`.
"""

from nltk.tokenize.api import StringTokenizer, TokenizerI
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize


class SpaceTokenizer(StringTokenizer):
    r"""Tokenize a string using the space character as a delimiter,
    which is the same as ``s.split(' ')``.

        >>> from nltk.tokenize import SpaceTokenizer
        >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
        >>> SpaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
        'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    """

    _string = " "


class TabTokenizer(StringTokenizer):
    r"""Tokenize a string using the tab character as a delimiter,
    the same as ``s.split('\t')``.

        >>> from nltk.tokenize import TabTokenizer
        >>> TabTokenizer().tokenize('a\tb c\n\t d')
        ['a', 'b c\n', ' d']
    """

    _string = "\t"


class CharTokenizer(StringTokenizer):
    """Tokenize a string into individual characters.  If this functionality
    is ever required directly, use ``for char in string``.
    """

    _string = None

    def tokenize(self, s):
        return list(s)

    def span_tokenize(self, s):
        yield from enumerate(range(1, len(s) + 1))
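        # Each span pairs an index with its successor: for "abc" this yields
        # (0, 1), (1, 2), (2, 3), i.e. one unit-width slice per character.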


class LineTokenizer(TokenizerI):
    r"""Tokenize a string into its lines, optionally discarding blank lines.
    This is similar to ``s.split('\n')``.

        >>> from nltk.tokenize import LineTokenizer
        >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
        >>> LineTokenizer(blanklines='keep').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good muffins cost $3.88', 'in New York. Please buy me',
        'two of them.', '', 'Thanks.']
        >>> # same as [l for l in s.split('\n') if l.strip()]:
        >>> LineTokenizer(blanklines='discard').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good muffins cost $3.88', 'in New York. Please buy me',
        'two of them.', 'Thanks.']

    :param blanklines: Indicates how blank lines should be handled.  Valid values are:

        - ``discard``: strip blank lines out of the token list before returning it.
          A line is considered blank if it contains only whitespace characters.
        - ``keep``: leave all blank lines in the token list.
        - ``discard-eof``: if the string ends with a newline, then do not generate
          a corresponding token ``''`` after that newline.
    """

    def __init__(self, blanklines="discard"):
        valid_blanklines = ("discard", "keep", "discard-eof")
        if blanklines not in valid_blanklines:
            raise ValueError(
                "Blank lines must be one of: %s" % " ".join(valid_blanklines)
            )

        self._blanklines = blanklines

    def tokenize(self, s):
        lines = s.splitlines()
        # If requested, strip off blank lines.
        if self._blanklines == "discard":
            lines = [l for l in lines if l.rstrip()]
        elif self._blanklines == "discard-eof":
            if lines and not lines[-1].strip():
                lines.pop()
        return lines

    # discard-eof not implemented
    def span_tokenize(self, s):
        if self._blanklines == "keep":
            yield from string_span_tokenize(s, r"\n")
        else:
            yield from regexp_span_tokenize(s, r"\n(\s+\n)*")


######################################################################
# { Tokenization Functions
######################################################################
# XXX: it is stated in the module docs that there are no function versions


def line_tokenize(text, blanklines="discard"):
    return LineTokenizer(blanklines).tokenize(text)
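
Because all of these classes implement the shared ``TokenizerI`` interface, they are interchangeable anywhere a tokenizer is expected. A minimal sketch (the `count_tokens` helper is illustrative, not part of NLTK):

    from nltk.tokenize.api import TokenizerI
    from nltk.tokenize.simple import LineTokenizer, SpaceTokenizer

    def count_tokens(tokenizer: TokenizerI, text: str) -> int:
        # Any TokenizerI implementation exposes .tokenize(text) -> list of strings.
        return len(tokenizer.tokenize(text))

    s = "Good muffins cost $3.88\nin New York."
    print(count_tokens(SpaceTokenizer(), s))  # splits on ' '
    print(count_tokens(LineTokenizer(), s))   # splits on newlines -> 2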
@@ -0,0 +1,194 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Christopher Hench <chris.l.hench@gmail.com>
#         Alex Estes
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

"""
The Sonority Sequencing Principle (SSP) is a language agnostic algorithm proposed
by Otto Jespersen in 1904. The sonorous quality of a phoneme is judged by the
openness of the lips. Syllable breaks occur before troughs in sonority. For more
on the SSP see Selkirk (1984).

The default implementation uses the English alphabet, but the `sonority_hierarchy`
can be modified to IPA or any other alphabet for the use-case. The SSP is a
universal syllabification algorithm, but that does not mean it performs equally
across languages. Bartlett et al. (2009) is a good benchmark for English accuracy
if utilizing IPA (pg. 311).

Importantly, if a custom hierarchy is supplied and vowels span across more than
one level, they should be given separately to the `vowels` class attribute.

References:

- Otto Jespersen. 1904. Lehrbuch der Phonetik.
  Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203.
- Elisabeth Selkirk. 1984. On the major class features and syllable theory.
  In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology.
  Cambridge, MIT Press. pp. 107-136.
- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes.
  In HLT-NAACL. pp. 308-316.
"""

import re
import warnings
from string import punctuation

from nltk.tokenize.api import TokenizerI
from nltk.util import ngrams


class SyllableTokenizer(TokenizerI):
    """
    Syllabifies words based on the Sonority Sequencing Principle (SSP).

        >>> from nltk.tokenize import SyllableTokenizer
        >>> from nltk import word_tokenize
        >>> SSP = SyllableTokenizer()
        >>> SSP.tokenize('justification')
        ['jus', 'ti', 'fi', 'ca', 'tion']
        >>> text = "This is a foobar-like sentence."
        >>> [SSP.tokenize(token) for token in word_tokenize(text)]
        [['This'], ['is'], ['a'], ['foo', 'bar', '-', 'li', 'ke'], ['sen', 'ten', 'ce'], ['.']]
    """

    def __init__(self, lang="en", sonority_hierarchy=False):
        """
        :param lang: Language parameter, default is English, 'en'
        :type lang: str
        :param sonority_hierarchy: Sonority hierarchy according to the
            Sonority Sequencing Principle.
        :type sonority_hierarchy: list(str)
        """
        # The sonority hierarchy should be provided in descending order.
        # If vowels are spread across multiple levels, they should be
        # assigned to the self.vowels variable together; otherwise they
        # should be placed in the first index of the hierarchy.
        if not sonority_hierarchy and lang == "en":
            sonority_hierarchy = [
                "aeiouy",  # vowels.
                "lmnrw",  # nasals.
                "zvsf",  # fricatives.
                "bcdgtkpqxhj",  # stops.
            ]

        self.vowels = sonority_hierarchy[0]
        self.phoneme_map = {}
        for i, level in enumerate(sonority_hierarchy):
            for c in level:
                sonority_level = len(sonority_hierarchy) - i
                self.phoneme_map[c] = sonority_level
                self.phoneme_map[c.upper()] = sonority_level

    def assign_values(self, token):
        """
        Assigns each phoneme its value from the sonority hierarchy.
        Note: Sentence/text has to be tokenized first.

        :param token: Single word or token
        :type token: str
        :return: List of tuples, first element is character/phoneme and
            second is the sonority value.
        :rtype: list(tuple(str, int))
        """
        syllables_values = []
        for c in token:
            try:
                syllables_values.append((c, self.phoneme_map[c]))
            except KeyError:
                if c not in "0123456789" and c not in punctuation:
                    warnings.warn(
                        "Character not defined in sonority_hierarchy,"
                        " assigning as vowel: '{}'".format(c)
                    )
                    syllables_values.append((c, max(self.phoneme_map.values())))
                    if c not in self.vowels:
                        self.vowels += c
                else:  # If it's a punctuation mark or a number, assign -1.
                    syllables_values.append((c, -1))
        return syllables_values

    def validate_syllables(self, syllable_list):
        """
        Ensures each syllable has at least one vowel.
        If the following syllable doesn't have a vowel, add it to the current one.

        :param syllable_list: Single word or token broken up into syllables.
        :type syllable_list: list(str)
        :return: Single word or token broken up into syllables
            (with added syllables if necessary)
        :rtype: list(str)
        """
        valid_syllables = []
        front = ""
        vowel_pattern = re.compile("|".join(self.vowels))
        for i, syllable in enumerate(syllable_list):
            if syllable in punctuation:
                valid_syllables.append(syllable)
                continue
            if not vowel_pattern.search(syllable):
                if len(valid_syllables) == 0:
                    front += syllable
                else:
                    valid_syllables = valid_syllables[:-1] + [
                        valid_syllables[-1] + syllable
                    ]
            else:
                if len(valid_syllables) == 0:
                    valid_syllables.append(front + syllable)
                else:
                    valid_syllables.append(syllable)

        return valid_syllables

    def tokenize(self, token):
        """
        Apply the SSP to return a list of syllables.
        Note: Sentence/text has to be tokenized first.

        :param token: Single word or token
        :type token: str
        :return syllable_list: Single word or token broken up into syllables.
        :rtype: list(str)
        """
        # assign values from hierarchy
        syllables_values = self.assign_values(token)

        # if only one vowel return word
        if sum(token.count(x) for x in self.vowels) <= 1:
            return [token]

        syllable_list = []
        syllable = syllables_values[0][0]  # start syllable with first phoneme
        for trigram in ngrams(syllables_values, n=3):
            phonemes, values = zip(*trigram)
            # Sonority of previous, focal and following phoneme
            prev_value, focal_value, next_value = values
            # Focal phoneme.
            focal_phoneme = phonemes[1]

            # These cases trigger a syllable break.
            if focal_value == -1:  # If it's punctuation, just break.
                syllable_list.append(syllable)
                syllable_list.append(focal_phoneme)
                syllable = ""
            elif prev_value >= focal_value == next_value:
                syllable += focal_phoneme
                syllable_list.append(syllable)
                syllable = ""

            elif prev_value > focal_value < next_value:
                syllable_list.append(syllable)
                syllable = ""
                syllable += focal_phoneme

            # no syllable break
            else:
                syllable += focal_phoneme

        syllable += syllables_values[-1][0]  # append last phoneme
        syllable_list.append(syllable)

        return self.validate_syllables(syllable_list)
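
As the module docstring notes, the hierarchy can be swapped out for another alphabet. A minimal sketch (the four-level hierarchy below is an invented illustration, not a shipped configuration):

    from nltk.tokenize import SyllableTokenizer

    # Custom descending hierarchy: vowels first, then increasingly closed sounds.
    custom = ["aeiou", "jlmnrwy", "szvf", "bdgptkc"]
    ssp = SyllableTokenizer(sonority_hierarchy=custom)
    print(ssp.tokenize("segment"))

If vowels appeared on more than one level, they would additionally need to be assigned to the tokenizer's `vowels` attribute, as the docstring warns.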
115
backend/venv/Lib/site-packages/nltk/tokenize/stanford.py
Normal file
@@ -0,0 +1,115 @@
# Natural Language Toolkit: Interface to the Stanford Tokenizer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Xu <xxu@student.unimelb.edu.au>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import json
import os
import tempfile
import warnings
from subprocess import PIPE

from nltk.internals import _java_options, config_java, find_jar, java
from nltk.parse.corenlp import CoreNLPParser
from nltk.tokenize.api import TokenizerI

_stanford_url = "https://nlp.stanford.edu/software/tokenizer.shtml"


class StanfordTokenizer(TokenizerI):
    r"""
    Interface to the Stanford Tokenizer

        >>> from nltk.tokenize.stanford import StanfordTokenizer
        >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
        >>> StanfordTokenizer().tokenize(s)  # doctest: +SKIP
        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
        >>> s = "The colour of the wall is blue."
        >>> StanfordTokenizer(options={"americanize": True}).tokenize(s)  # doctest: +SKIP
        ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
    """

    _JAR = "stanford-postagger.jar"

    def __init__(
        self,
        path_to_jar=None,
        encoding="utf8",
        options=None,
        verbose=False,
        java_options="-mx1000m",
    ):
        # Raise deprecation warning.
        warnings.warn(
            str(
                "\nThe StanfordTokenizer will "
                "be deprecated in version 3.2.5.\n"
                "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead."
            ),
            DeprecationWarning,
            stacklevel=2,
        )

        self._stanford_jar = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=("STANFORD_POSTAGGER",),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )

        self._encoding = encoding
        self.java_options = java_options

        options = {} if options is None else options
        self._options_cmd = ",".join(f"{key}={val}" for key, val in options.items())

    @staticmethod
    def _parse_tokenized_output(s):
        return s.splitlines()

    def tokenize(self, s):
        """
        Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences.
        """
        cmd = ["edu.stanford.nlp.process.PTBTokenizer"]
        return self._parse_tokenized_output(self._execute(cmd, s))

    def _execute(self, cmd, input_, verbose=False):
        encoding = self._encoding
        cmd.extend(["-charset", encoding])
        _options_cmd = self._options_cmd
        if _options_cmd:
            cmd.extend(["-options", self._options_cmd])

        default_options = " ".join(_java_options)

        # Configure java.
        config_java(options=self.java_options, verbose=verbose)

        # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
        with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
            # Write the actual sentences to the temporary input file
            if isinstance(input_, str) and encoding:
                input_ = input_.encode(encoding)
            input_file.write(input_)
            input_file.flush()

            cmd.append(input_file.name)

            # Run the tagger and get the output.
            stdout, stderr = java(
                cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
            )
            stdout = stdout.decode(encoding)

        os.unlink(input_file.name)

        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)

        return stdout
292
backend/venv/Lib/site-packages/nltk/tokenize/stanford_segmenter.py
Normal file
@@ -0,0 +1,292 @@
#!/usr/bin/env python
# Natural Language Toolkit: Interface to the Stanford Segmenter
# for Chinese and Arabic
#
# Copyright (C) 2001-2025 NLTK Project
# Author: 52nlp <52nlpcn@gmail.com>
#         Casper Lehmann-Strøm <casperlehmann@gmail.com>
#         Alex Constantin <alex@keyworder.ch>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import json
import os
import tempfile
import warnings
from subprocess import PIPE

from nltk.internals import (
    _java_options,
    config_java,
    find_dir,
    find_file,
    find_jar,
    java,
)
from nltk.tokenize.api import TokenizerI

_stanford_url = "https://nlp.stanford.edu/software"


class StanfordSegmenter(TokenizerI):
    """Interface to the Stanford Segmenter

    If the stanford-segmenter version is older than 2016-10-31, then path_to_slf4j
    should be provided, for example::

        seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')

        >>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
        >>> seg = StanfordSegmenter()  # doctest: +SKIP
        >>> seg.default_config('zh')  # doctest: +SKIP
        >>> sent = u'这是斯坦福中文分词器测试'
        >>> print(seg.segment(sent))  # doctest: +SKIP
        \u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5
        <BLANKLINE>
        >>> seg.default_config('ar')  # doctest: +SKIP
        >>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات'
        >>> print(seg.segment(sent.split()))  # doctest: +SKIP
        \u0647\u0630\u0627 \u0647\u0648 \u062a\u0635\u0646\u064a\u0641 \u0633\u062a\u0627\u0646\u0641\u0648\u0631\u062f \u0627\u0644\u0639\u0631\u0628\u064a \u0644 \u0627\u0644\u0643\u0644\u0645\u0627\u062a
        <BLANKLINE>
    """

    _JAR = "stanford-segmenter.jar"

    def __init__(
        self,
        path_to_jar=None,
        path_to_slf4j=None,
        java_class=None,
        path_to_model=None,
        path_to_dict=None,
        path_to_sihan_corpora_dict=None,
        sihan_post_processing="false",
        keep_whitespaces="false",
        encoding="UTF-8",
        options=None,
        verbose=False,
        java_options="-mx2g",
    ):
        # Raise deprecation warning.
        warnings.simplefilter("always", DeprecationWarning)
        warnings.warn(
            str(
                "\nThe StanfordSegmenter will "
                "be deprecated in version 3.2.5.\n"
                "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead."
            ),
            DeprecationWarning,
            stacklevel=2,
        )
        warnings.simplefilter("ignore", DeprecationWarning)

        stanford_segmenter = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=("STANFORD_SEGMENTER",),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )
        if path_to_slf4j is not None:
            slf4j = find_jar(
                "slf4j-api.jar",
                path_to_slf4j,
                env_vars=("SLF4J", "STANFORD_SEGMENTER"),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
            )
        else:
            slf4j = None

        # This is passed to java as the -cp option; the old version of the
        # segmenter needs slf4j. The new version, stanford-segmenter-2016-10-31,
        # doesn't need slf4j.
        self._stanford_jar = os.pathsep.join(
            _ for _ in [stanford_segmenter, slf4j] if _ is not None
        )

        self._java_class = java_class
        self._model = path_to_model
        self._sihan_corpora_dict = path_to_sihan_corpora_dict
        self._sihan_post_processing = sihan_post_processing
        self._keep_whitespaces = keep_whitespaces
        self._dict = path_to_dict

        self._encoding = encoding
        self.java_options = java_options
        options = {} if options is None else options
        self._options_cmd = ",".join(
            f"{key}={json.dumps(val)}" for key, val in options.items()
        )

    def default_config(self, lang):
        """
        Attempt to initialize the Stanford Word Segmenter for the specified language,
        using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables.
        """

        search_path = ()
        if os.environ.get("STANFORD_SEGMENTER"):
            search_path = {os.path.join(os.environ.get("STANFORD_SEGMENTER"), "data")}

        # init for Chinese-specific files
        self._dict = None
        self._sihan_corpora_dict = None
        self._sihan_post_processing = "false"

        if lang == "ar":
            self._java_class = (
                "edu.stanford.nlp.international.arabic.process.ArabicSegmenter"
            )
            model = "arabic-segmenter-atb+bn+arztrain.ser.gz"

        elif lang == "zh":
            self._java_class = "edu.stanford.nlp.ie.crf.CRFClassifier"
            model = "pku.gz"
            self._sihan_post_processing = "true"

            path_to_dict = "dict-chris6.ser.gz"
            try:
                self._dict = find_file(
                    path_to_dict,
                    searchpath=search_path,
                    url=_stanford_url,
                    verbose=False,
                    env_vars=("STANFORD_MODELS",),
                )
            except LookupError as e:
                raise LookupError(
                    "Could not find '%s' (tried using env. "
                    "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)"
                    % path_to_dict
                ) from e

            sihan_dir = "./data/"
            try:
                path_to_sihan_dir = find_dir(
                    sihan_dir,
                    url=_stanford_url,
                    verbose=False,
                    env_vars=("STANFORD_SEGMENTER",),
                )
                self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
            except LookupError as e:
                raise LookupError(
                    "Could not find '%s' (tried using the "
                    "STANFORD_SEGMENTER environment variable)" % sihan_dir
                ) from e
        else:
            raise LookupError(f"Unsupported language {lang}")

        try:
            self._model = find_file(
                model,
                searchpath=search_path,
                url=_stanford_url,
                verbose=False,
                env_vars=("STANFORD_MODELS", "STANFORD_SEGMENTER"),
            )
        except LookupError as e:
            raise LookupError(
                "Could not find '%s' (tried using env. "
                "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model
            ) from e

    def tokenize(self, s):
        super().tokenize(s)

    def segment_file(self, input_file_path):
        """ """
        cmd = [
            self._java_class,
            "-loadClassifier",
            self._model,
            "-keepAllWhitespaces",
            self._keep_whitespaces,
            "-textFile",
            input_file_path,
        ]
        if self._sihan_corpora_dict is not None:
            cmd.extend(
                [
                    "-serDictionary",
                    self._dict,
                    "-sighanCorporaDict",
                    self._sihan_corpora_dict,
                    "-sighanPostProcessing",
                    self._sihan_post_processing,
                ]
            )

        stdout = self._execute(cmd)

        return stdout

    def segment(self, tokens):
        return self.segment_sents([tokens])

    def segment_sents(self, sentences):
        """ """
        encoding = self._encoding
        # Create a temporary input file
        _input_fh, self._input_file_path = tempfile.mkstemp(text=True)

        # Write the actual sentences to the temporary input file
        _input_fh = os.fdopen(_input_fh, "wb")
        _input = "\n".join(" ".join(x) for x in sentences)
        if isinstance(_input, str) and encoding:
            _input = _input.encode(encoding)
        _input_fh.write(_input)
        _input_fh.close()

        cmd = [
            self._java_class,
            "-loadClassifier",
            self._model,
            "-keepAllWhitespaces",
            self._keep_whitespaces,
            "-textFile",
            self._input_file_path,
        ]
        if self._sihan_corpora_dict is not None:
            cmd.extend(
                [
                    "-serDictionary",
                    self._dict,
                    "-sighanCorporaDict",
                    self._sihan_corpora_dict,
                    "-sighanPostProcessing",
                    self._sihan_post_processing,
                ]
            )

        stdout = self._execute(cmd)

        # Delete the temporary file
        os.unlink(self._input_file_path)

        return stdout

    def _execute(self, cmd, verbose=False):
        encoding = self._encoding
        cmd.extend(["-inputEncoding", encoding])
        _options_cmd = self._options_cmd
        if _options_cmd:
            cmd.extend(["-options", self._options_cmd])

        default_options = " ".join(_java_options)

        # Configure java.
        config_java(options=self.java_options, verbose=verbose)

        stdout, _stderr = java(
            cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
        )
        stdout = stdout.decode(encoding)

        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)

        return stdout
474
backend/venv/Lib/site-packages/nltk/tokenize/texttiling.py
Normal file
@@ -0,0 +1,474 @@
# Natural Language Toolkit: TextTiling
#
# Copyright (C) 2001-2025 NLTK Project
# Author: George Boutsioukis
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import math
import re

try:
    import numpy
except ImportError:
    pass

from nltk.tokenize.api import TokenizerI

BLOCK_COMPARISON, VOCABULARY_INTRODUCTION = 0, 1
LC, HC = 0, 1
DEFAULT_SMOOTHING = [0]


class TextTilingTokenizer(TokenizerI):
    """Tokenize a document into topical sections using the TextTiling algorithm.
    This algorithm detects subtopic shifts based on the analysis of lexical
    co-occurrence patterns.

    The process starts by tokenizing the text into pseudosentences of
    a fixed size w. Then, depending on the method used, similarity
    scores are assigned at sentence gaps. The algorithm proceeds by
    detecting the peak differences between these scores and marking
    them as boundaries. The boundaries are normalized to the closest
    paragraph break and the segmented text is returned.

    :param w: Pseudosentence size
    :type w: int
    :param k: Size (in sentences) of the block used in the block comparison method
    :type k: int
    :param similarity_method: The method used for determining similarity scores:
        `BLOCK_COMPARISON` (default) or `VOCABULARY_INTRODUCTION`.
    :type similarity_method: constant
    :param stopwords: A list of stopwords that are filtered out (defaults to NLTK's stopwords corpus)
    :type stopwords: list(str)
    :param smoothing_method: The method used for smoothing the score plot:
        `DEFAULT_SMOOTHING` (default)
    :type smoothing_method: constant
    :param smoothing_width: The width of the window used by the smoothing method
    :type smoothing_width: int
    :param smoothing_rounds: The number of smoothing passes
    :type smoothing_rounds: int
    :param cutoff_policy: The policy used to determine the number of boundaries:
        `HC` (default) or `LC`
    :type cutoff_policy: constant

        >>> from nltk.corpus import brown
        >>> tt = TextTilingTokenizer(demo_mode=True)
        >>> text = brown.raw()[:4000]
        >>> s, ss, d, b = tt.tokenize(text)
        >>> b
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0]
    """

    def __init__(
        self,
        w=20,
        k=10,
        similarity_method=BLOCK_COMPARISON,
        stopwords=None,
        smoothing_method=DEFAULT_SMOOTHING,
        smoothing_width=2,
        smoothing_rounds=1,
        cutoff_policy=HC,
        demo_mode=False,
    ):
        if stopwords is None:
            from nltk.corpus import stopwords

            stopwords = stopwords.words("english")
        self.__dict__.update(locals())
        del self.__dict__["self"]

    def tokenize(self, text):
        """Return a tokenized copy of *text*, where each "token" represents
        a separate topic."""

        lowercase_text = text.lower()
        paragraph_breaks = self._mark_paragraph_breaks(text)
        text_length = len(lowercase_text)

        # Tokenization step starts here

        # Remove punctuation
        nopunct_text = "".join(
            c for c in lowercase_text if re.match(r"[a-z\-' \n\t]", c)
        )
        nopunct_par_breaks = self._mark_paragraph_breaks(nopunct_text)

        tokseqs = self._divide_to_tokensequences(nopunct_text)

        # The morphological stemming step mentioned in the TextTile
        # paper is not implemented. A comment in the original C
        # implementation states that it offers no benefit to the
        # process. It might be interesting to test the existing
        # stemmers though.
        # words = _stem_words(words)

        # Filter stopwords
        for ts in tokseqs:
            ts.wrdindex_list = [
                wi for wi in ts.wrdindex_list if wi[0] not in self.stopwords
            ]

        token_table = self._create_token_table(tokseqs, nopunct_par_breaks)
        # End of the Tokenization step

        # Lexical score determination
        if self.similarity_method == BLOCK_COMPARISON:
            gap_scores = self._block_comparison(tokseqs, token_table)
        elif self.similarity_method == VOCABULARY_INTRODUCTION:
            raise NotImplementedError("Vocabulary introduction not implemented")
        else:
            raise ValueError(
                f"Similarity method {self.similarity_method} not recognized"
            )

        if self.smoothing_method == DEFAULT_SMOOTHING:
            smooth_scores = self._smooth_scores(gap_scores)
        else:
            raise ValueError(f"Smoothing method {self.smoothing_method} not recognized")
        # End of Lexical score Determination

        # Boundary identification
        depth_scores = self._depth_scores(smooth_scores)
        segment_boundaries = self._identify_boundaries(depth_scores)

        normalized_boundaries = self._normalize_boundaries(
            text, segment_boundaries, paragraph_breaks
        )
        # End of Boundary Identification
        segmented_text = []
        prevb = 0

        for b in normalized_boundaries:
            if b == 0:
                continue
            segmented_text.append(text[prevb:b])
            prevb = b

        if prevb < text_length:  # append any text that may be remaining
            segmented_text.append(text[prevb:])

        if not segmented_text:
            segmented_text = [text]

        if self.demo_mode:
            return gap_scores, smooth_scores, depth_scores, segment_boundaries
        return segmented_text
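
    # Example (editor's sketch, not part of the original module): with the
    # default settings the tokenizer returns the list of topical segments
    # directly (requires numpy plus the stopwords and brown corpora)::
    #
    #     tt = TextTilingTokenizer()
    #     segments = tt.tokenize(brown.raw()[:4000])
    #     len(segments)  # number of topical sections found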

    def _block_comparison(self, tokseqs, token_table):
        """Implements the block comparison method"""

        def blk_frq(tok, block):
            ts_occs = filter(lambda o: o[0] in block, token_table[tok].ts_occurences)
            freq = sum(tsocc[1] for tsocc in ts_occs)
            return freq

        gap_scores = []
        numgaps = len(tokseqs) - 1

        for curr_gap in range(numgaps):
            score_dividend, score_divisor_b1, score_divisor_b2 = 0.0, 0.0, 0.0
            score = 0.0
            # adjust window size for boundary conditions
            if curr_gap < self.k - 1:
                window_size = curr_gap + 1
            elif curr_gap > numgaps - self.k:
                window_size = numgaps - curr_gap
            else:
                window_size = self.k

            b1 = [ts.index for ts in tokseqs[curr_gap - window_size + 1 : curr_gap + 1]]
            b2 = [ts.index for ts in tokseqs[curr_gap + 1 : curr_gap + window_size + 1]]

            for t in token_table:
                score_dividend += blk_frq(t, b1) * blk_frq(t, b2)
                score_divisor_b1 += blk_frq(t, b1) ** 2
                score_divisor_b2 += blk_frq(t, b2) ** 2
            try:
                score = score_dividend / math.sqrt(score_divisor_b1 * score_divisor_b2)
            except ZeroDivisionError:
                pass  # score += 0.0

            gap_scores.append(score)

        return gap_scores
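
    # The gap score above is the cosine similarity between the two blocks'
    # term-frequency vectors:
    #
    #     score(g) = sum_t f(t, b1) * f(t, b2)
    #                / sqrt(sum_t f(t, b1)**2 * sum_t f(t, b2)**2)
    #
    # 1.0 means the blocks share the same vocabulary distribution; 0.0 means
    # they share no terms at all.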

    def _smooth_scores(self, gap_scores):
        "Wraps the smooth function from the SciPy Cookbook"
        return list(
            smooth(numpy.array(gap_scores[:]), window_len=self.smoothing_width + 1)
        )

    def _mark_paragraph_breaks(self, text):
        """Identifies indented text or line breaks as the beginning of
        paragraphs"""
        MIN_PARAGRAPH = 100
        pattern = re.compile("[ \t\r\f\v]*\n[ \t\r\f\v]*\n[ \t\r\f\v]*")
        matches = pattern.finditer(text)

        last_break = 0
        pbreaks = [0]
        for pb in matches:
            if pb.start() - last_break < MIN_PARAGRAPH:
                continue
            else:
                pbreaks.append(pb.start())
                last_break = pb.start()

        return pbreaks

    def _divide_to_tokensequences(self, text):
        "Divides the text into pseudosentences of fixed size"
        w = self.w
        wrdindex_list = []
        matches = re.finditer(r"\w+", text)
        for match in matches:
            wrdindex_list.append((match.group(), match.start()))
        return [
            TokenSequence(i / w, wrdindex_list[i : i + w])
            for i in range(0, len(wrdindex_list), w)
        ]

    def _create_token_table(self, token_sequences, par_breaks):
        "Creates a table of TokenTableFields"
        token_table = {}
        current_par = 0
        current_tok_seq = 0
        pb_iter = par_breaks.__iter__()
        current_par_break = next(pb_iter)
        if current_par_break == 0:
            try:
                current_par_break = next(pb_iter)  # skip break at 0
            except StopIteration as e:
                raise ValueError(
                    "No paragraph breaks were found (text too short perhaps?)"
                ) from e
        for ts in token_sequences:
            for word, index in ts.wrdindex_list:
                try:
                    while index > current_par_break:
                        current_par_break = next(pb_iter)
                        current_par += 1
                except StopIteration:
                    # hit bottom
                    pass

                if word in token_table:
                    token_table[word].total_count += 1

                    if token_table[word].last_par != current_par:
                        token_table[word].last_par = current_par
                        token_table[word].par_count += 1

                    if token_table[word].last_tok_seq != current_tok_seq:
                        token_table[word].last_tok_seq = current_tok_seq
                        token_table[word].ts_occurences.append([current_tok_seq, 1])
                    else:
                        token_table[word].ts_occurences[-1][1] += 1
                else:  # new word
                    token_table[word] = TokenTableField(
                        first_pos=index,
                        ts_occurences=[[current_tok_seq, 1]],
                        total_count=1,
                        par_count=1,
                        last_par=current_par,
                        last_tok_seq=current_tok_seq,
                    )

            current_tok_seq += 1

        return token_table

    def _identify_boundaries(self, depth_scores):
        """Identifies boundaries at the peaks of similarity score
        differences"""

        boundaries = [0 for x in depth_scores]

        avg = sum(depth_scores) / len(depth_scores)
        stdev = numpy.std(depth_scores)

        if self.cutoff_policy == LC:
            cutoff = avg - stdev
        else:
            cutoff = avg - stdev / 2.0

        depth_tuples = sorted(zip(depth_scores, range(len(depth_scores))))
        depth_tuples.reverse()
        hp = list(filter(lambda x: x[0] > cutoff, depth_tuples))

        for dt in hp:
            boundaries[dt[1]] = 1
            for dt2 in hp:  # undo if there is a boundary close already
                if (
                    dt[1] != dt2[1]
                    and abs(dt2[1] - dt[1]) < 4
                    and boundaries[dt2[1]] == 1
                ):
                    boundaries[dt[1]] = 0
        return boundaries

    def _depth_scores(self, scores):
        """Calculates the depth of each gap, i.e. the average difference
        between the left and right peaks and the gap's score"""

        depth_scores = [0 for x in scores]
        # clip boundaries: this holds on the rule of thumb (my thumb)
        # that a section shouldn't be smaller than at least 2
        # pseudosentences for small texts and around 5 for larger ones.

        clip = min(max(len(scores) // 10, 2), 5)
        index = clip

        for gapscore in scores[clip:-clip]:
            lpeak = gapscore
            for score in scores[index::-1]:
                if score >= lpeak:
                    lpeak = score
                else:
                    break
            rpeak = gapscore
            for score in scores[index:]:
                if score >= rpeak:
                    rpeak = score
                else:
                    break
            depth_scores[index] = lpeak + rpeak - 2 * gapscore
            index += 1

        return depth_scores
|
||||
|
||||
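    # Editorial worked example (illustrative, ignoring the boundary clip):
    # for scores = [5, 3, 6], the gap at index 1 has lpeak = 5 and rpeak = 6,
    # so its depth is 5 + 6 - 2 * 3 = 5; deep valleys between two
    # high-similarity plateaus receive the largest depth scores.
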
    def _normalize_boundaries(self, text, boundaries, paragraph_breaks):
        """Normalize the boundaries identified to the original text's
        paragraph breaks"""

        norm_boundaries = []
        char_count, word_count, gaps_seen = 0, 0, 0
        seen_word = False

        for char in text:
            char_count += 1
            if char in " \t\n" and seen_word:
                seen_word = False
                word_count += 1
            if char not in " \t\n" and not seen_word:
                seen_word = True
            if gaps_seen < len(boundaries) and word_count > (
                max(gaps_seen * self.w, self.w)
            ):
                if boundaries[gaps_seen] == 1:
                    # find closest paragraph break
                    best_fit = len(text)
                    for br in paragraph_breaks:
                        if best_fit > abs(br - char_count):
                            best_fit = abs(br - char_count)
                            bestbr = br
                        else:
                            break
                    if bestbr not in norm_boundaries:  # avoid duplicates
                        norm_boundaries.append(bestbr)
                gaps_seen += 1

        return norm_boundaries


class TokenTableField:
    """A field in the token table holding parameters for each token,
    used later in the process"""

    def __init__(
        self,
        first_pos,
        ts_occurences,
        total_count=1,
        par_count=1,
        last_par=0,
        last_tok_seq=None,
    ):
        self.__dict__.update(locals())
        del self.__dict__["self"]


class TokenSequence:
    "A token list with its original length and its index"

    def __init__(self, index, wrdindex_list, original_length=None):
        original_length = original_length or len(wrdindex_list)
        self.__dict__.update(locals())
        del self.__dict__["self"]


# Pasted from the SciPy cookbook: https://www.scipy.org/Cookbook/SignalSmooth
def smooth(x, window_len=11, window="flat"):
    """smooth the data using a window with requested size.

    This method is based on the convolution of a scaled window with the signal.
    The signal is prepared by introducing reflected copies of the signal
    (with the window size) in both ends so that transient parts are minimized
    in the beginning and end part of the output signal.

    :param x: the input signal
    :param window_len: the dimension of the smoothing window; should be an odd integer
    :param window: the type of window from 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'
        flat window will produce a moving average smoothing.

    :return: the smoothed signal

    example::

        t = numpy.linspace(-2, 2, 50)
        x = numpy.sin(t) + numpy.random.randn(len(t)) * 0.1
        y = smooth(x)

    :see also: numpy.hanning, numpy.hamming, numpy.bartlett, numpy.blackman, numpy.convolve,
        scipy.signal.lfilter

    TODO: the window parameter could be the window itself if an array instead of a string
    """

    if x.ndim != 1:
        raise ValueError("smooth only accepts 1 dimension arrays.")

    if x.size < window_len:
        raise ValueError("Input vector needs to be bigger than window size.")

    if window_len < 3:
        return x

    if window not in ["flat", "hanning", "hamming", "bartlett", "blackman"]:
        raise ValueError(
            "Window must be one of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'"
        )

    s = numpy.r_[2 * x[0] - x[window_len:1:-1], x, 2 * x[-1] - x[-1:-window_len:-1]]

    if window == "flat":  # moving average
        w = numpy.ones(window_len, "d")
    else:
        # getattr avoids eval(); equivalent for the validated window names above.
        w = getattr(numpy, window)(window_len)

    y = numpy.convolve(w / w.sum(), s, mode="same")

    return y[window_len - 1 : -window_len + 1]


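# Editor's sketch (illustrative, not part of NLTK): a minimal self-contained
# check of `smooth`. The reflected padding keeps the output the same length
# as the input, and a flat window is a plain moving average.
def _smooth_example():
    rng = numpy.random.RandomState(0)  # fixed seed for reproducibility
    t = numpy.linspace(-2, 2, 200)
    x = numpy.sin(t) + rng.randn(len(t)) * 0.1
    y = smooth(x, window_len=11, window="flat")
    assert len(y) == len(x)
    return y

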
def demo(text=None):
    from matplotlib import pylab

    from nltk.corpus import brown

    tt = TextTilingTokenizer(demo_mode=True)
    if text is None:
        text = brown.raw()[:10000]
    s, ss, d, b = tt.tokenize(text)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)), b)
    pylab.legend()
    pylab.show()

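
# Editorial addition (not in the original module): allow running the demo
# directly as a script.
if __name__ == "__main__":
    demo()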

180
backend/venv/Lib/site-packages/nltk/tokenize/toktok.py
Normal file
@@ -0,0 +1,180 @@
# Natural Language Toolkit: Python port of the tok-tok.pl tokenizer.
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Jon Dehdari
# Contributors: Liling Tan, Selcuk Ayguney, ikegami, Martijn Pieters,
#               Alex Rudnick
#
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

"""
The tok-tok tokenizer is a simple, general tokenizer, where the input has one
sentence per line; thus only the final period is tokenized.

Tok-tok has been tested on, and gives reasonably good results for English,
Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others.
The input should be in UTF-8 encoding.

Reference:
Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language
Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University.
"""

import re

from nltk.tokenize.api import TokenizerI


class ToktokTokenizer(TokenizerI):
    """
    This is a Python port of the tok-tok.pl from
    https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl

    >>> toktok = ToktokTokenizer()
    >>> text = u'Is 9.5 or 525,600 my favorite number?'
    >>> print(toktok.tokenize(text, return_str=True))
    Is 9.5 or 525,600 my favorite number ?
    >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
    >>> print(toktok.tokenize(text, return_str=True))
    The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
    >>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
    >>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
    >>> assert toktok.tokenize(text, return_str=True) == expected
    >>> toktok.tokenize(text) == [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf']
    True
    """

    # Replace non-breaking spaces with normal spaces.
    NON_BREAKING = re.compile("\u00A0"), " "

    # Pad some funky punctuation.
    FUNKY_PUNCT_1 = re.compile(r'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r" \1 "
    # Pad more funky punctuation.
    FUNKY_PUNCT_2 = re.compile(r"([({\[“‘„‚«‹「『])"), r" \1 "
    # Pad En dash and em dash
    EN_EM_DASHES = re.compile("([–—])"), r" \1 "

    # Replace problematic character with numeric character reference.
    AMPERCENT = re.compile("& "), "&amp; "
    TAB = re.compile("\t"), " &#9; "
    PIPE = re.compile(r"\|"), " &#124; "

    # Pad numbers with commas to keep them from further tokenization.
    COMMA_IN_NUM = re.compile(r"(?<!,)([,،])(?![,\d])"), r" \1 "

    # Just pad problematic (often neurotic) hyphen/single quote, etc.
    PROB_SINGLE_QUOTES = re.compile(r"(['’`])"), r" \1 "
    # Group ` ` stupid quotes ' ' into a single token.
    STUPID_QUOTES_1 = re.compile(r" ` ` "), r" `` "
    STUPID_QUOTES_2 = re.compile(r" ' ' "), r" '' "

    # Don't tokenize period unless it ends the line and that it isn't
    # preceded by another period, e.g.
    # "something ..." -> "something ..."
    # "something." -> "something ."
    FINAL_PERIOD_1 = re.compile(r"(?<!\.)\.$"), r" ."
    # Don't tokenize period unless it ends the line eg.
    # " ... stuff." -> "... stuff ."
    FINAL_PERIOD_2 = re.compile(r"""(?<!\.)\.\s*(["'’»›”]) *$"""), r" . \1"

    # Treat continuous commas as fake German, Czech, etc.: „
    MULTI_COMMAS = re.compile(r"(,{2,})"), r" \1 "
    # Treat continuous dashes as fake en-dash, etc.
    MULTI_DASHES = re.compile(r"(-{2,})"), r" \1 "
    # Treat multiple periods as a thing (eg. ellipsis)
    MULTI_DOTS = re.compile(r"(\.{2,})"), r" \1 "

    # This is the \p{Open_Punctuation} from Perl's perluniprops
    # see https://perldoc.perl.org/perluniprops.html
    OPEN_PUNCT = str(
        "([{\u0f3a\u0f3c\u169b\u201a\u201e\u2045\u207d"
        "\u208d\u2329\u2768\u276a\u276c\u276e\u2770\u2772"
        "\u2774\u27c5\u27e6\u27e8\u27ea\u27ec\u27ee\u2983"
        "\u2985\u2987\u2989\u298b\u298d\u298f\u2991\u2993"
        "\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26"
        "\u2e28\u3008\u300a\u300c\u300e\u3010\u3014\u3016"
        "\u3018\u301a\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39"
        "\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe47\ufe59\ufe5b"
        "\ufe5d\uff08\uff3b\uff5b\uff5f\uff62"
    )
    # This is the \p{Close_Punctuation} from Perl's perluniprops
    CLOSE_PUNCT = str(
        ")]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a"
        "\u2769\u276b\u276d\u276f\u2771\u2773\u2775\u27c6"
        "\u27e7\u27e9\u27eb\u27ed\u27ef\u2984\u2986\u2988"
        "\u298a\u298c\u298e\u2990\u2992\u2994\u2996\u2998"
        "\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29\u3009"
        "\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b"
        "\u301e\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c"
        "\ufe3e\ufe40\ufe42\ufe44\ufe48\ufe5a\ufe5c\ufe5e"
        "\uff09\uff3d\uff5d\uff60\uff63"
    )
    # This is the \p{Currency_Symbol} from Perl's perluniprops
    CURRENCY_SYM = str(
        "$\xa2\xa3\xa4\xa5\u058f\u060b\u09f2\u09f3\u09fb"
        "\u0af1\u0bf9\u0e3f\u17db\u20a0\u20a1\u20a2\u20a3"
        "\u20a4\u20a5\u20a6\u20a7\u20a8\u20a9\u20aa\u20ab"
        "\u20ac\u20ad\u20ae\u20af\u20b0\u20b1\u20b2\u20b3"
        "\u20b4\u20b5\u20b6\u20b7\u20b8\u20b9\u20ba\ua838"
        "\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6"
    )

    # Pad spaces after opening punctuations.
    OPEN_PUNCT_RE = re.compile(f"([{OPEN_PUNCT}])"), r"\1 "
    # Pad spaces before closing punctuations.
    CLOSE_PUNCT_RE = re.compile(f"([{CLOSE_PUNCT}])"), r"\1 "
    # Pad spaces after currency symbols.
    CURRENCY_SYM_RE = re.compile(f"([{CURRENCY_SYM}])"), r"\1 "

    # Use for tokenizing URL-unfriendly characters: [:/?#]
    URL_FOE_1 = re.compile(r":(?!//)"), r" : "  # in perl s{:(?!//)}{ : }g;
    URL_FOE_2 = re.compile(r"\?(?!\S)"), r" ? "  # in perl s{\?(?!\S)}{ ? }g;
    # in perl: m{://} or m{\S+\.\S+/\S+} or s{/}{ / }g;
    URL_FOE_3 = re.compile(r"(:\/\/)[\S+\.\S+\/\S+][\/]"), " / "
    URL_FOE_4 = re.compile(r" /"), r" / "  # s{ /}{ / }g;

    # Left/Right strip, i.e. remove heading/trailing spaces.
    # These strip regexes should NOT be used,
    # instead use str.lstrip(), str.rstrip() or str.strip()
    # (They are kept for reference purposes to the original toktok.pl code)
    LSTRIP = re.compile(r"^ +"), ""
    RSTRIP = re.compile(r"\s+$"), "\n"
    # Merge multiple spaces.
    ONE_SPACE = re.compile(r" {2,}"), " "

    TOKTOK_REGEXES = [
        NON_BREAKING,
        FUNKY_PUNCT_1,
        FUNKY_PUNCT_2,
        URL_FOE_1,
        URL_FOE_2,
        URL_FOE_3,
        URL_FOE_4,
        AMPERCENT,
        TAB,
        PIPE,
        OPEN_PUNCT_RE,
        CLOSE_PUNCT_RE,
        MULTI_COMMAS,
        COMMA_IN_NUM,
        PROB_SINGLE_QUOTES,
        STUPID_QUOTES_1,
        STUPID_QUOTES_2,
        CURRENCY_SYM_RE,
        EN_EM_DASHES,
        MULTI_DASHES,
        MULTI_DOTS,
        FINAL_PERIOD_1,
        FINAL_PERIOD_2,
        ONE_SPACE,
    ]
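    # Note (editorial): the order of TOKTOK_REGEXES matters; for example,
    # ONE_SPACE must run last so that the padding introduced by the earlier
    # rules collapses back to single spaces.
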
    def tokenize(self, text, return_str=False):
        text = str(text)  # Converts input string into unicode.
        for regexp, substitution in self.TOKTOK_REGEXES:
            text = regexp.sub(substitution, text)
        # Finally, strips heading and trailing spaces
        # and converts output string into unicode.
        text = str(text.strip())
        return text if return_str else text.split()
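
# Editor's sketch (illustrative, not part of NLTK): typical usage. Tok-tok
# assumes one sentence per line, so only the sentence-final period is split:
#
#     >>> ToktokTokenizer().tokenize("Hello, world. And more.")
#     ['Hello', ',', 'world.', 'And', 'more', '.']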

402
backend/venv/Lib/site-packages/nltk/tokenize/treebank.py
Normal file
@@ -0,0 +1,402 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Michael Heilman <mheilman@cmu.edu> (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)
#         Tom Aarsen <> (modifications)
#
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

r"""

Penn Treebank Tokenizer

The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This implementation is a port of the tokenizer sed script written by Robert McIntyre
and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
"""

import re
import warnings
from typing import Iterator, List, Tuple

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.destructive import MacIntyreContractions
from nltk.tokenize.util import align_tokens


class TreebankWordTokenizer(TokenizerI):
    r"""
    The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.

    This tokenizer performs the following steps:

    - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
    - treat most punctuation characters as separate tokens
    - split off commas and single quotes, when followed by whitespace
    - separate periods that appear at the end of line

    >>> from nltk.tokenize import TreebankWordTokenizer
    >>> s = '''Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.'''
    >>> TreebankWordTokenizer().tokenize(s)
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
    >>> s = "They'll save and invest more."
    >>> TreebankWordTokenizer().tokenize(s)
    ['They', "'ll", 'save', 'and', 'invest', 'more', '.']
    >>> s = "hi, my name can't hello,"
    >>> TreebankWordTokenizer().tokenize(s)
    ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
    """

    # starting quotes
    STARTING_QUOTES = [
        (re.compile(r"^\""), r"``"),
        (re.compile(r"(``)"), r" \1 "),
        (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
    ]

    # punctuation
    PUNCTUATION = [
        (re.compile(r"([:,])([^\d])"), r" \1 \2"),
        (re.compile(r"([:,])$"), r" \1 "),
        (re.compile(r"\.\.\."), r" ... "),
        (re.compile(r"[;@#$%&]"), r" \g<0> "),
        (
            re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
            r"\1 \2\3 ",
        ),  # Handles the final period.
        (re.compile(r"[?!]"), r" \g<0> "),
        (re.compile(r"([^'])' "), r"\1 ' "),
    ]

    # Pads parentheses
    PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")

    # Optionally: Convert parentheses, brackets and converts them to PTB symbols.
    CONVERT_PARENTHESES = [
        (re.compile(r"\("), "-LRB-"),
        (re.compile(r"\)"), "-RRB-"),
        (re.compile(r"\["), "-LSB-"),
        (re.compile(r"\]"), "-RSB-"),
        (re.compile(r"\{"), "-LCB-"),
        (re.compile(r"\}"), "-RCB-"),
    ]

    DOUBLE_DASHES = (re.compile(r"--"), r" -- ")

    # ending quotes
    ENDING_QUOTES = [
        (re.compile(r"''"), " '' "),
        (re.compile(r'"'), " '' "),
        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
    ]

    # List of contractions adapted from Robert MacIntyre's tokenizer.
    _contractions = MacIntyreContractions()
    CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
    CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))

    def tokenize(
        self, text: str, convert_parentheses: bool = False, return_str: bool = False
    ) -> List[str]:
        r"""Return a tokenized copy of `text`.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.'''
        >>> TreebankWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
        'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']
        >>> TreebankWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
        'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']

        :param text: A string with a sentence or sentences.
        :type text: str
        :param convert_parentheses: if True, replace parentheses to PTB symbols,
            e.g. `(` to `-LRB-`. Defaults to False.
        :type convert_parentheses: bool, optional
        :param return_str: If True, return tokens as space-separated string,
            defaults to False.
        :type return_str: bool, optional
        :return: List of tokens from `text`.
        :rtype: List[str]
        """
        if return_str is not False:
            warnings.warn(
                "Parameter 'return_str' has been deprecated and should no "
                "longer be used.",
                category=DeprecationWarning,
                stacklevel=2,
            )

        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Handles parentheses.
        regexp, substitution = self.PARENS_BRACKETS
        text = regexp.sub(substitution, text)
        # Optionally convert parentheses
        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Handles double dash.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        # add extra space to make things easier
        text = " " + text + " "

        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r" \1 \2 ", text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r" \1 \2 ", text)

        # We are not using CONTRACTIONS4 since
        # they are also commented out in the SED scripts
        # for regexp in self._contractions.CONTRACTIONS4:
        #     text = regexp.sub(r' \1 \2 \3 ', text)

        return text.split()

    def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
        r"""
        Returns the spans of the tokens in ``text``.
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
        >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
        ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
        ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
        ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
        >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
        True
        >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
        ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
        ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
        >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
        True

        :param text: A string with a sentence or sentences.
        :type text: str
        :yield: Tuple[int, int]
        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to original double quotes
        # Do this only if original text contains double quote(s) or double
        # single-quotes (because '' might be transformed to `` if it is
        # treated as starting quotes).
        if ('"' in text) or ("''" in text):
            # Find double quotes and converted quotes
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Replace converted quotes back to double quotes
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        yield from align_tokens(tokens, text)


class TreebankWordDetokenizer(TokenizerI):
    r"""
    The Treebank detokenizer uses the reverse regex operations corresponding to
    the Treebank tokenizer's regexes.

    Note:

    - There are additional assumptions made when undoing the padding of ``[;@#$%&]``
      punctuation symbols that aren't presupposed in the TreebankTokenizer.
    - There are additional regexes added in reversing the parentheses tokenization,
      such as the ``r'([\]\)\}\>])\s([:;,.])'``, which removes the additional right
      padding added to the closing parentheses preceding ``[:;,.]``.
    - It's not possible to return the original whitespaces as they were because
      there was no explicit record of where `'\n'`, `'\t'` or `'\s'` were removed at
      the text.split() operation.

    >>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
    >>> s = '''Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.'''
    >>> d = TreebankWordDetokenizer()
    >>> t = TreebankWordTokenizer()
    >>> toks = t.tokenize(s)
    >>> d.detokenize(toks)
    'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'

    The MXPOST parentheses substitution can be undone using the ``convert_parentheses``
    parameter:

    >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
    >>> expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
    ... 'New', '-LRB-', 'York', '-RRB-', '.', 'Please', '-LRB-', 'buy',
    ... '-RRB-', 'me', 'two', 'of', 'them.', '-LRB-', 'Thanks', '-RRB-', '.']
    >>> expected_tokens == t.tokenize(s, convert_parentheses=True)
    True
    >>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).'
    >>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True)
    True

    During tokenization it's safe to add more spaces but during detokenization,
    simply undoing the padding doesn't really help.

    - During tokenization, left and right pad is added to ``[!?]``, when
      detokenizing, only left shift the ``[!?]`` is needed.
      Thus ``(re.compile(r'\s([?!])'), r'\g<1>')``.

    - During tokenization ``[:,]`` are left and right padded but when detokenizing,
      only left shift is necessary and we keep right pad after comma/colon
      if the string after is a non-digit.
      Thus ``(re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')``.

    >>> from nltk.tokenize.treebank import TreebankWordDetokenizer
    >>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
    >>> twd = TreebankWordDetokenizer()
    >>> twd.detokenize(toks)
    "hello, i can't feel my feet! Help!!"

    >>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!',
    ... 'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!']
    >>> twd.detokenize(toks)
    "hello, i can't feel; my feet! Help!! He said: Help, help?!"
    """

    _contractions = MacIntyreContractions()
    CONTRACTIONS2 = [
        re.compile(pattern.replace("(?#X)", r"\s"))
        for pattern in _contractions.CONTRACTIONS2
    ]
    CONTRACTIONS3 = [
        re.compile(pattern.replace("(?#X)", r"\s"))
        for pattern in _contractions.CONTRACTIONS3
    ]

    # ending quotes
    ENDING_QUOTES = [
        (re.compile(r"([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1\2 "),
        (re.compile(r"([^' ])\s('[sS]|'[mM]|'[dD]|') "), r"\1\2 "),
        (re.compile(r"(\S)\s(\'\')"), r"\1\2"),
        (
            re.compile(r"(\'\')\s([.,:)\]>};%])"),
            r"\1\2",
        ),  # Quotes followed by no-left-padded punctuations.
        (re.compile(r"''"), '"'),
    ]

    # Handles double dashes
    DOUBLE_DASHES = (re.compile(r" -- "), r"--")

    # Optionally: Convert parentheses, brackets and converts them from PTB symbols.
    CONVERT_PARENTHESES = [
        (re.compile("-LRB-"), "("),
        (re.compile("-RRB-"), ")"),
        (re.compile("-LSB-"), "["),
        (re.compile("-RSB-"), "]"),
        (re.compile("-LCB-"), "{"),
        (re.compile("-RCB-"), "}"),
    ]

    # Undo padding on parentheses.
    PARENS_BRACKETS = [
        (re.compile(r"([\[\(\{\<])\s"), r"\g<1>"),
        (re.compile(r"\s([\]\)\}\>])"), r"\g<1>"),
        (re.compile(r"([\]\)\}\>])\s([:;,.])"), r"\1\2"),
    ]

    # punctuation
    PUNCTUATION = [
        (re.compile(r"([^'])\s'\s"), r"\1' "),
        (re.compile(r"\s([?!])"), r"\g<1>"),  # Strip left pad for [?!]
        # (re.compile(r'\s([?!])\s'), r'\g<1>'),
        (re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r"\1\2\3"),
        # When tokenizing, [;@#$%&] are padded with whitespace regardless of
        # whether there are spaces before or after them.
        # But during detokenization, we need to distinguish between left/right
        # pad, so we split this up.
        (re.compile(r"([#$])\s"), r"\g<1>"),  # Left pad.
        (re.compile(r"\s([;%])"), r"\g<1>"),  # Right pad.
        # (re.compile(r"\s([&*])\s"), r" \g<1> "),  # Unknown pad.
        (re.compile(r"\s\.\.\.\s"), r"..."),
        # (re.compile(r"\s([:,])\s$"), r"\1"),  # .strip() takes care of it.
        (
            re.compile(r"\s([:,])"),
            r"\1",
        ),  # Just remove left padding. Punctuation in numbers won't be padded.
    ]

    # starting quotes
    STARTING_QUOTES = [
        (re.compile(r"([ (\[{<])\s``"), r"\1``"),
        (re.compile(r"(``)\s"), r"\1"),
        (re.compile(r"``"), r'"'),
    ]

    def tokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str:
        """
        Treebank detokenizer, created by undoing the regexes from
        the TreebankWordTokenizer.tokenize.

        :param tokens: A list of strings, i.e. tokenized text.
        :type tokens: List[str]
        :param convert_parentheses: if True, replace PTB symbols with parentheses,
            e.g. `-LRB-` to `(`. Defaults to False.
        :type convert_parentheses: bool, optional
        :return: str
        """
        text = " ".join(tokens)

        # Add extra space to make things easier
        text = " " + text + " "

        # Reverse the contractions regexes.
        # Note: CONTRACTIONS4 are not used in tokenization.
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r"\1\2", text)
        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r"\1\2", text)

        # Reverse the regexes applied for ending quotes.
        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        # Undo the space padding.
        text = text.strip()

        # Reverse the padding on double dashes.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Reverse the padding regexes applied for parenthesis/brackets.
        for regexp, substitution in self.PARENS_BRACKETS:
            text = regexp.sub(substitution, text)

        # Reverse the regexes applied for punctuations.
        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Reverse the regexes applied for starting quotes.
        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        return text.strip()

    def detokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str:
        """Duck-typing the abstract *tokenize()*."""
        return self.tokenize(tokens, convert_parentheses)
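
# Editor's sketch (illustrative, not part of NLTK): a tokenize/detokenize
# round-trip. Exact whitespace is not restored (see the class note above),
# but the surface string of a simple sentence is:
#
#     >>> s = "They'll save and invest more."
#     >>> TreebankWordDetokenizer().detokenize(TreebankWordTokenizer().tokenize(s))
#     "They'll save and invest more."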

295
backend/venv/Lib/site-packages/nltk/tokenize/util.py
Normal file
@@ -0,0 +1,295 @@
# Natural Language Toolkit: Tokenizer Utilities
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

from re import finditer
from xml.sax.saxutils import escape, unescape


def string_span_tokenize(s, sep):
    r"""
    Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
    tuples, by splitting the string at each occurrence of *sep*.

    >>> from nltk.tokenize.util import string_span_tokenize
    >>> s = '''Good muffins cost $3.88\nin New York. Please buy me
    ... two of them.\n\nThanks.'''
    >>> list(string_span_tokenize(s, " ")) # doctest: +NORMALIZE_WHITESPACE
    [(0, 4), (5, 12), (13, 17), (18, 26), (27, 30), (31, 36), (37, 37),
    (38, 44), (45, 48), (49, 55), (56, 58), (59, 73)]

    :param s: the string to be tokenized
    :type s: str
    :param sep: the token separator
    :type sep: str
    :rtype: iter(tuple(int, int))
    """
    if len(sep) == 0:
        raise ValueError("Token delimiter must not be empty")
    left = 0
    while True:
        try:
            right = s.index(sep, left)
            if right != 0:
                yield left, right
        except ValueError:
            if left != len(s):
                yield left, len(s)
            break

        left = right + len(sep)


def regexp_span_tokenize(s, regexp):
    r"""
    Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
    tuples, by splitting the string at each successive match of *regexp*.

    >>> from nltk.tokenize.util import regexp_span_tokenize
    >>> s = '''Good muffins cost $3.88\nin New York. Please buy me
    ... two of them.\n\nThanks.'''
    >>> list(regexp_span_tokenize(s, r'\s')) # doctest: +NORMALIZE_WHITESPACE
    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36),
    (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]

    :param s: the string to be tokenized
    :type s: str
    :param regexp: regular expression that matches token separators (must not be empty)
    :type regexp: str
    :rtype: iter(tuple(int, int))
    """
    left = 0
    for m in finditer(regexp, s):
        right, next = m.span()
        if right != left:
            yield left, right
        left = next
    yield left, len(s)
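
# Note (editorial): unlike string_span_tokenize above, the final ``yield`` is
# unconditional, so a trailing separator produces an empty final span, e.g.
# list(regexp_span_tokenize("a b ", r"\s"))[-1] == (4, 4).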


def spans_to_relative(spans):
    r"""
    Return a sequence of relative spans, given a sequence of spans.

    >>> from nltk.tokenize import WhitespaceTokenizer
    >>> from nltk.tokenize.util import spans_to_relative
    >>> s = '''Good muffins cost $3.88\nin New York. Please buy me
    ... two of them.\n\nThanks.'''
    >>> list(spans_to_relative(WhitespaceTokenizer().span_tokenize(s))) # doctest: +NORMALIZE_WHITESPACE
    [(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5), (2, 6),
    (1, 3), (1, 2), (1, 3), (1, 2), (1, 5), (2, 7)]

    :param spans: a sequence of (start, end) offsets of the tokens
    :type spans: iter(tuple(int, int))
    :rtype: iter(tuple(int, int))
    """
    prev = 0
    for left, right in spans:
        yield left - prev, right - left
        prev = right


class CJKChars:
    """
    An object that enumerates the code points of the CJK characters as listed on
    https://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane

    This is a Python port of the CJK code point enumerations of Moses tokenizer:
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl#L309
    """

    # Hangul Jamo (1100–11FF)
    Hangul_Jamo = (4352, 4607)  # (ord(u"\u1100"), ord(u"\u11ff"))

    # CJK Radicals Supplement (2E80–2EFF)
    # Kangxi Radicals (2F00–2FDF)
    # Ideographic Description Characters (2FF0–2FFF)
    # CJK Symbols and Punctuation (3000–303F)
    # Hiragana (3040–309F)
    # Katakana (30A0–30FF)
    # Bopomofo (3100–312F)
    # Hangul Compatibility Jamo (3130–318F)
    # Kanbun (3190–319F)
    # Bopomofo Extended (31A0–31BF)
    # CJK Strokes (31C0–31EF)
    # Katakana Phonetic Extensions (31F0–31FF)
    # Enclosed CJK Letters and Months (3200–32FF)
    # CJK Compatibility (3300–33FF)
    # CJK Unified Ideographs Extension A (3400–4DBF)
    # Yijing Hexagram Symbols (4DC0–4DFF)
    # CJK Unified Ideographs (4E00–9FFF)
    # Yi Syllables (A000–A48F)
    # Yi Radicals (A490–A4CF)
    CJK_Radicals = (11904, 42191)  # (ord(u"\u2e80"), ord(u"\ua4cf"))

    # Phags-pa (A840–A87F)
    Phags_Pa = (43072, 43135)  # (ord(u"\ua840"), ord(u"\ua87f"))

    # Hangul Syllables (AC00–D7AF)
    Hangul_Syllables = (44032, 55215)  # (ord(u"\uAC00"), ord(u"\uD7AF"))

    # CJK Compatibility Ideographs (F900–FAFF)
    CJK_Compatibility_Ideographs = (63744, 64255)  # (ord(u"\uF900"), ord(u"\uFAFF"))

    # CJK Compatibility Forms (FE30–FE4F)
    CJK_Compatibility_Forms = (65072, 65103)  # (ord(u"\uFE30"), ord(u"\uFE4F"))

    # Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters
    Katakana_Hangul_Halfwidth = (65381, 65500)  # (ord(u"\uFF65"), ord(u"\uFFDC"))

    # Supplementary Ideographic Plane 20000–2FFFF
    Supplementary_Ideographic_Plane = (
        131072,
        196607,
    )  # (ord(u"\U00020000"), ord(u"\U0002FFFF"))

    ranges = [
        Hangul_Jamo,
        CJK_Radicals,
        Phags_Pa,
        Hangul_Syllables,
        CJK_Compatibility_Ideographs,
        CJK_Compatibility_Forms,
        Katakana_Hangul_Halfwidth,
        Supplementary_Ideographic_Plane,
    ]


def is_cjk(character):
    """
    Python port of Moses' code to check for CJK character.

    >>> CJKChars().ranges
    [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607)]
    >>> is_cjk(u'\u33fe')
    True
    >>> is_cjk(u'\uFE5F')
    False

    :param character: The character that needs to be checked.
    :type character: char
    :return: bool
    """
    return any(
        [
            start <= ord(character) <= end
            for start, end in [
                (4352, 4607),
                (11904, 42191),
                (43072, 43135),
                (44032, 55215),
                (63744, 64255),
                (65072, 65103),
                (65381, 65500),
                (131072, 196607),
            ]
        ]
    )
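
# Note (editorial): the range list hardcoded inside is_cjk duplicates
# CJKChars.ranges, presumably inlined to avoid attribute lookups per call;
# the two listings must be kept in sync.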


def xml_escape(text):
    """
    This function transforms the input text into an "escaped" version suitable
    for well-formed XML formatting.

    Note that the default xml.sax.saxutils.escape() function doesn't escape
    some characters that Moses does so we have to manually add them to the
    entities dictionary.

    >>> input_str = ''')| & < > ' " ] ['''
    >>> expected_output = ''')| &amp; &lt; &gt; ' " ] ['''
    >>> escape(input_str) == expected_output
    True
    >>> xml_escape(input_str)
    ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'

    :param text: The text that needs to be escaped.
    :type text: str
    :rtype: str
    """
    return escape(
        text,
        entities={
            r"'": r"&apos;",
            r'"': r"&quot;",
            r"|": r"&#124;",
            r"[": r"&#91;",
            r"]": r"&#93;",
        },
    )


def xml_unescape(text):
    """
    This function transforms the "escaped" version suitable
    for well-formed XML formatting into a human-readable string.

    Note that the default xml.sax.saxutils.unescape() function doesn't unescape
    some characters that Moses does so we have to manually add them to the
    entities dictionary.

    >>> from xml.sax.saxutils import unescape
    >>> s = ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
    >>> expected = ''')| & < > \' " ] ['''
    >>> xml_unescape(s) == expected
    True

    :param text: The text that needs to be unescaped.
    :type text: str
    :rtype: str
    """
    return unescape(
        text,
        entities={
            r"&apos;": r"'",
            r"&quot;": r'"',
            r"&#124;": r"|",
            r"&#91;": r"[",
            r"&#93;": r"]",
        },
    )
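
# Editor's sketch (illustrative, not part of NLTK): xml_escape and
# xml_unescape are inverses over the extra Moses entities:
#
#     >>> xml_unescape(xml_escape("[a] | 'b'")) == "[a] | 'b'"
#     True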


def align_tokens(tokens, sentence):
    """
    This function attempts to find the offsets of the tokens in *s*, as a sequence
    of ``(start, end)`` tuples, given the tokens and also the source string.

    >>> from nltk.tokenize import TreebankWordTokenizer
    >>> from nltk.tokenize.util import align_tokens
    >>> s = str("The plane, bound for St Petersburg, crashed in Egypt's "
    ... "Sinai desert just 23 minutes after take-off from Sharm el-Sheikh "
    ... "on Saturday.")
    >>> tokens = TreebankWordTokenizer().tokenize(s)
    >>> expected = [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23),
    ... (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54),
    ... (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89),
    ... (90, 98), (99, 103), (104, 109), (110, 119), (120, 122),
    ... (123, 131), (131, 132)]
    >>> output = list(align_tokens(tokens, s))
    >>> len(tokens) == len(expected) == len(output)  # Check that length of tokens and tuples are the same.
    True
    >>> expected == list(align_tokens(tokens, s))  # Check that the output is as expected.
    True
    >>> tokens == [s[start:end] for start, end in output]  # Check that the slices of the string corresponds to the tokens.
    True

    :param tokens: The list of strings that are the result of tokenization
    :type tokens: list(str)
    :param sentence: The original string
    :type sentence: str
    :rtype: list(tuple(int,int))
    """
    point = 0
    offsets = []
    for token in tokens:
        try:
            start = sentence.index(token, point)
        except ValueError as e:
            raise ValueError(f'substring "{token}" not found in "{sentence}"') from e
        point = start + len(token)
        offsets.append((start, point))
    return offsets
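
# Editor's sketch (illustrative, not part of NLTK): align_tokens only
# requires that each token occur left-to-right in the source string:
#
#     >>> align_tokens(["a", "b"], "a b")
#     [(0, 1), (2, 3)]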