2026-02-01 09:31:38 +01:00

57 lines
1.5 KiB
Python

# -*- coding: utf-8 -*-
import re
import string
# Character class matching any single ASCII punctuation mark.
PUNCTUATION_REGEX = re.compile('[{0}]'.format(re.escape(string.punctuation)))


def strip_punc(s, all=False):
    """Strip surrounding whitespace, then punctuation, from a string.

    :param s: The string.
    :param all: If True, delete every punctuation character found anywhere
        in the string. If False, only trim punctuation from the two ends;
        whitespace exposed by that trimming is left in place.
    """
    trimmed = s.strip()
    if not all:
        # Only peel punctuation off the ends; the interior is untouched.
        return trimmed.strip(string.punctuation)
    return PUNCTUATION_REGEX.sub('', trimmed)
def lowerstrip(s, all=False):
    """Lowercase a string, then strip its whitespace and punctuation.

    :param s: The string.
    :param all: Passed through to ``strip_punc``. If True, all punctuation
        is removed; if False, only punctuation at the ends of the string.
    """
    lowered = s.lower()
    trimmed = lowered.strip()
    return strip_punc(trimmed, all=all)
def tree2str(tree, concat=' '):
    """Join the words of a nltk.tree.Tree into a single string.

    ``tree`` is iterated as a sequence of (word, tag) pairs; the tags are
    discarded and the words are joined with ``concat``.

    For example:

    (NP a/DT beautiful/JJ new/JJ dashboard/NN) -> "a beautiful new dashboard"
    """
    return concat.join(word for word, _tag in tree)
def filter_insignificant(chunk, tag_suffixes=('DT', 'CC', 'PRP$', 'PRP')):
    """Filter out insignificant (word, tag) tuples from a chunk of text.

    A pair is dropped when its tag ends with any of ``tag_suffixes``.

    :param chunk: An iterable of (word, tag) pairs.
    :param tag_suffixes: Tag endings that mark a pair as insignificant. Any
        iterable of strings is accepted, including one-shot iterators.
    :returns: A new list of the (word, tag) tuples that were kept, in their
        original order.
    """
    # Materialize once: str.endswith needs a tuple, and a one-shot iterator
    # would otherwise be exhausted after the first few pairs.
    suffixes = tuple(tag_suffixes)
    return [(word, tag) for word, tag in chunk if not tag.endswith(suffixes)]
def is_filelike(obj):
    """Tell whether ``obj`` looks like a file.

    An object counts as file-like when it exposes a ``read`` attribute; the
    attribute is not checked for being callable.
    """
    has_read = hasattr(obj, 'read')
    return has_read