Get list of common stop words in various languages in Python
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

tests.py 3.0KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. """
  2. Tests for stop-words
  3. """
  4. import random
  5. from unittest import TestCase
  6. from unittest import TestSuite
  7. from unittest import TestLoader
  8. import stop_words
  9. from stop_words import get_stop_words
  10. from stop_words import safe_get_stop_words
  11. from stop_words import StopWordError
  12. from stop_words import LANGUAGE_MAPPING
  13. from stop_words import AVAILABLE_LANGUAGES
  14. class StopWordsTestCase(TestCase):
  15. number_of_english_stop_words = 174
  16. def test_get_stop_words(self):
  17. sw = get_stop_words('english')
  18. self.assertEqual(len(sw), self.number_of_english_stop_words)
  19. def test_get_stop_words_language_mapping(self):
  20. sw = get_stop_words('en')
  21. self.assertEqual(len(sw), self.number_of_english_stop_words)
  22. self.assertEqual(sw, get_stop_words('english'))
  23. def test_get_stop_words_cache(self):
  24. self.assertFalse('french' in stop_words.STOP_WORDS_CACHE)
  25. sw = get_stop_words('fr')
  26. self.assertTrue('french' in stop_words.STOP_WORDS_CACHE)
  27. original_stop_words_dir = stop_words.STOP_WORDS_DIR
  28. stop_words.STOP_WORDS_DIR = 'not-existing-directory'
  29. self.assertEqual(sw, get_stop_words('french'))
  30. stop_words.STOP_WORDS_DIR = original_stop_words_dir
  31. try:
  32. get_stop_words('klingon')
  33. except:
  34. pass
  35. self.assertFalse('klingon' in stop_words.STOP_WORDS_CACHE)
  36. def test_get_stop_words_unavailable_language(self):
  37. self.assertRaises(StopWordError, get_stop_words, 'sindarin')
  38. def test_get_stop_words_install_issue(self):
  39. original_stop_words_dir = stop_words.STOP_WORDS_DIR
  40. stop_words.STOP_WORDS_DIR = 'not-existing-directory'
  41. self.assertRaises(StopWordError, get_stop_words, 'german')
  42. stop_words.STOP_WORDS_DIR = original_stop_words_dir
  43. def test_safe_get_stop_words(self):
  44. self.assertRaises(StopWordError, get_stop_words, 'huttese')
  45. self.assertEqual(safe_get_stop_words('huttese'), [])
  46. def test_random_language_stop_words_load(self):
  47. languages = list(LANGUAGE_MAPPING.keys()) + list(AVAILABLE_LANGUAGES)
  48. sample = random.sample(languages, len(languages))
  49. for language in sample:
  50. stop_words = safe_get_stop_words(language)
  51. self.assertTrue(
  52. len(stop_words) > 0,
  53. 'Cannot load stopwords for {0} language'.format(language)
  54. )
  55. def test_filters(self):
  56. language = 'en'
  57. before = get_stop_words(language, False)
  58. letter = random.choice(random.choice(before))
  59. def remove_letter(stopwords, language):
  60. return [word for word in stopwords if letter not in word]
  61. stop_words.add_filter(remove_letter)
  62. after = get_stop_words(language, False)
  63. for stopword in after:
  64. self.assertFalse(letter in stopword)
  65. self.assertTrue(stop_words.remove_filter(remove_letter))
  66. loader = TestLoader()
  67. test_suite = TestSuite(
  68. [
  69. loader.loadTestsFromTestCase(StopWordsTestCase),
  70. ]
  71. )