From 209b6d3eec9e2ca617ffba3aacd649434ac88b28 Mon Sep 17 00:00:00 2001
From: Shengyu Zhang
Date: Thu, 30 Oct 2025 20:50:39 +0800
Subject: [PATCH] chore: Drop stopwords-related logic

---
 pyproject.toml                     |  2 --
 src/sphinxnotes/snippet/ext.py     |  4 ++--
 src/sphinxnotes/snippet/keyword.py | 16 +---------------
 3 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 1fe0301..2251a7a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,8 +48,6 @@ dependencies = [
   "jieba-next",
   "python-pinyin",
   "pyxdg",
-  "stopwordsiso",
-  "setuptools", # req by stopwordsiso, https://stackoverflow.com/a/39930983/4799273
   "wcwidth",
   "wordsegment",
   # CUSTOM DEPENDENCIES END
diff --git a/src/sphinxnotes/snippet/ext.py b/src/sphinxnotes/snippet/ext.py
index 178c98c..9941038 100644
--- a/src/sphinxnotes/snippet/ext.py
+++ b/src/sphinxnotes/snippet/ext.py
@@ -63,9 +63,9 @@ def extract_excerpt(s: Snippet) -> str:
 
 
 def extract_keywords(s: Snippet) -> list[str]:
     keywords = [s.docname]
     if isinstance(s, WithTitle) and s.title is not None:
-        keywords.extend(extractor.extract(s.title, strip_stopwords=False))
+        keywords.extend(extractor.extract(s.title))
     if isinstance(s, Code):
-        keywords.extend(extractor.extract(s.desc, strip_stopwords=False))
+        keywords.extend(extractor.extract(s.desc))
     return keywords
diff --git a/src/sphinxnotes/snippet/keyword.py b/src/sphinxnotes/snippet/keyword.py
index 3cb4313..ef262b6 100644
--- a/src/sphinxnotes/snippet/keyword.py
+++ b/src/sphinxnotes/snippet/keyword.py
@@ -26,7 +26,6 @@ def __init__(self):
         from langid import rank
         from jieba_next import cut_for_search, setLogLevel
         from pypinyin import lazy_pinyin
-        from stopwordsiso import stopwords
         from wordsegment import load, segment
 
         # Turn off jieba debug log.
@@ -38,16 +37,13 @@ def __init__(self):
         self._tokenize_zh_cn = cut_for_search
         self._tokenize_en = segment
         self._pinyin = lazy_pinyin
-        self._stopwords = stopwords
 
         self._punctuation = (
             string.punctuation
             + '！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.·'
         )
 
-    def extract(
-        self, text: str, top_n: int | None = None, strip_stopwords: bool = True
-    ) -> list[str]:
+    def extract(self, text: str, top_n: int | None = None) -> list[str]:
         """Return keywords of given text."""
         # TODO: zh -> en
         # Normalize
@@ -57,8 +53,6 @@ def extract(
         # Invalid token removal
         words = self.strip_invalid_token(words)
         # Stopwords removal
-        if strip_stopwords:
-            words = self.strip_stopwords(words)
         if top_n:
             # Get top n words as keyword
             keywords = Counter(words).most_common(top_n)
@@ -106,13 +100,5 @@ def tokenize(self, text: str) -> list[str]:
     def trans_to_pinyin(self, word: str) -> str | None:
         return ' '.join(self._pinyin(word, errors='ignore'))
 
-    def strip_stopwords(self, words: list[str]) -> list[str]:
-        stw = self._stopwords(['en', 'zh'])
-        new_words = []
-        for word in words:
-            if word not in stw:
-                new_words.append(word)
-        return new_words
-
     def strip_invalid_token(self, tokens: list[str]) -> list[str]:
         return [token for token in tokens if token != '']
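
Note for downstream users (a postscript, not part of the patch itself):
extract() no longer takes a strip_stopwords argument, and stopwordsiso is no
longer a dependency. Callers that still want stopword filtering can
post-process the returned keyword list themselves. Below is a minimal sketch,
assuming stopwordsiso is installed separately and that the extractor class in
keyword.py is named Extractor; the class name is not visible in this diff, so
treat it as a placeholder:

    from stopwordsiso import stopwords  # install separately; dropped from pyproject.toml
    from sphinxnotes.snippet.keyword import Extractor  # assumed class name

    extractor = Extractor()
    # Same language selection the removed strip_stopwords() method used.
    stw = stopwords(['en', 'zh'])

    # Extract keywords first, then drop stopwords as a separate
    # post-processing step, mirroring the removed strip_stopwords() logic.
    words = extractor.extract('an example sentence about stopword handling')
    keywords = [w for w in words if w not in stw]
    print(keywords)

This keeps stopwordsiso (and its setuptools requirement) out of the library's
dependency tree while leaving the filtering policy to callers.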