Browse Source

+ filtering

pull/8/head
Taras Labiak 4 years ago
parent
commit
d4932610b8
5 changed files with 69 additions and 8 deletions
  1. 2
    1
      .gitignore
  2. 2
    1
      ChangeLog.rst
  3. 47
    1
      stop_words/__init__.py
  4. 1
    1
      stop_words/stop-words
  5. 17
    4
      stop_words/tests.py

+ 2
- 1
.gitignore View File

@@ -15,4 +15,5 @@ src/
.c9/
bin/
develop-eggs/
eggs/
eggs/
stop_words/stop-words/

+ 2
- 1
ChangeLog.rst View File

@@ -1,8 +1,9 @@
2015.2.22
2015.2.23
=========
----

* Feature: Using the cache is optional
* Feature: Filtering stopwords

2015.2.21
=========

+ 47
- 1
stop_words/__init__.py View File

@@ -11,7 +11,7 @@ with open(os.path.join(STOP_WORDS_DIR, 'languages.json'), 'rb') as map_file:
buffer = buffer.decode('ascii')
LANGUAGE_MAPPING = json.loads(buffer)

AVAILABLE_LANGUAGES = LANGUAGE_MAPPING.values()
AVAILABLE_LANGUAGES = list(LANGUAGE_MAPPING.values())


def get_version():
@@ -47,6 +47,7 @@ def get_stop_words(language, cache=True):
with open(language_filename, 'rb') as language_file:
stop_words = [line.decode('utf-8').strip()
for line in language_file.readlines()]
stop_words = apply_filters(stop_words, language)
except IOError:
raise StopWordError(
'{0}" file is unreadable, check your installation.'.format(
@@ -59,6 +60,51 @@ def get_stop_words(language, cache=True):

return stop_words

# Registry of stop-word filters, keyed by language. The ``None`` key holds
# the filters that apply_filters() runs for every language.
_filters = {None: []}


def apply_filters(stopwords, language):
    """
    Apply registered filters to stopwords.

    Filters registered for ``language`` run first and are called as
    ``func(stopwords)``. Filters registered for all languages (stored
    under the ``None`` key) run afterwards and are called as
    ``func(stopwords, language)``.

    :param stopwords: list
    :param language: string
    :return: filtered stopwords
    """
    # ``None`` is the registry key of the global filters, not a language.
    # Skip the per-language pass for it so the global filters are not run
    # twice, the first time with the one-argument calling convention.
    if language is not None:
        for func in _filters.get(language, ()):
            stopwords = func(stopwords)

    for func in _filters[None]:
        stopwords = func(stopwords, language)

    return stopwords


def add_filter(func, language=None):
    """
    Register a filter for a specific language.

    If language == None the filter applies for all languages.
    Filter will not apply for stop words in cache.

    Filters registered for a specific language are called as
    ``func(stopwords)``; filters registered for all languages are called
    as ``func(stopwords, language)``.

    :param func: callable
    :param language: string|None
    :return: None
    """
    # setdefault creates the per-language list on first registration.
    _filters.setdefault(language, []).append(func)


def remove_filter(func, language=None):
    """
    Unregister a filter previously added with the same ``language``.

    Only the first registration of ``func`` for that language is removed.

    :param func: callable to unregister
    :param language: string|None, the key the filter was registered under
    :return: True if the filter was found and removed, False otherwise
    """
    try:
        _filters[language].remove(func)
    except (KeyError, ValueError):
        # KeyError: nothing was ever registered for this language.
        # ValueError: func is not in that language's filter list.
        return False
    return True


def safe_get_stop_words(language):
"""

+ 1
- 1
stop_words/stop-words

@@ -1 +1 @@
Subproject commit 25c6a0aea665871e887f155b883e950c3743ce50
Subproject commit 56cfbd788c846404d1cf5793c8b823ad794d3c8d

+ 17
- 4
stop_words/tests.py View File

@@ -10,7 +10,6 @@ import stop_words
from stop_words import get_stop_words
from stop_words import safe_get_stop_words
from stop_words import StopWordError
from stop_words import STOP_WORDS_CACHE
from stop_words import LANGUAGE_MAPPING
from stop_words import AVAILABLE_LANGUAGES

@@ -28,9 +27,9 @@ class StopWordsTestCase(TestCase):
self.assertEqual(sw, get_stop_words('english'))

def test_get_stop_words_cache(self):
self.assertFalse('french' in STOP_WORDS_CACHE)
self.assertFalse('french' in stop_words.STOP_WORDS_CACHE)
sw = get_stop_words('fr')
self.assertTrue('french' in STOP_WORDS_CACHE)
self.assertTrue('french' in stop_words.STOP_WORDS_CACHE)
original_stop_words_dir = stop_words.STOP_WORDS_DIR
stop_words.STOP_WORDS_DIR = 'not-existing-directory'
self.assertEqual(sw, get_stop_words('french'))
@@ -39,7 +38,7 @@ class StopWordsTestCase(TestCase):
get_stop_words('klingon')
except:
pass
self.assertFalse('klingon' in STOP_WORDS_CACHE)
self.assertFalse('klingon' in stop_words.STOP_WORDS_CACHE)

def test_get_stop_words_unavailable_language(self):
self.assertRaises(StopWordError, get_stop_words, 'sindarin')
@@ -64,6 +63,20 @@ class StopWordsTestCase(TestCase):
'Cannot load stopwords for {0} language'.format(language)
)

def test_filters(self):
    """A filter registered for all languages is applied to loaded stop words."""
    language = 'en'
    # The second positional argument of get_stop_words is ``cache``; it is
    # passed as False for both loads (add_filter's docstring notes that
    # filters are not applied to cached stop words).
    before = get_stop_words(language, False)
    letter = random.choice(random.choice(before))

    def remove_letter(stopwords, language):
        return [word for word in stopwords if letter not in word]

    stop_words.add_filter(remove_letter)
    try:
        after = get_stop_words(language, False)
        for stopword in after:
            self.assertFalse(letter in stopword)
    finally:
        # Always unregister: a failure above would otherwise leave the
        # filter in the module-level registry for the tests that follow.
        removed = stop_words.remove_filter(remove_letter)
    self.assertTrue(removed)


loader = TestLoader()

test_suite = TestSuite(

Loading…
Cancel
Save