From a07b0d7da735e07441ee7fbefd0197b5ca13d8fe Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Fri, 3 May 2019 09:49:17 +0200 Subject: [PATCH 1/3] add sentence delimiter option to preserve original sentencization --- summa/preprocessing/textcleaner.py | 19 ++++++++++++------- summa/summarizer.py | 4 ++-- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/summa/preprocessing/textcleaner.py b/summa/preprocessing/textcleaner.py index 9716774..c1d4e8e 100644 --- a/summa/preprocessing/textcleaner.py +++ b/summa/preprocessing/textcleaner.py @@ -56,9 +56,9 @@ def init_textcleanner(language, additional_stopwords): set_stopwords_by_language(language, additional_stopwords) -def split_sentences(text): +def split_sentences(text, sentence_delimiter=None): processed = replace_abbreviations(text) - return [undo_replacement(sentence) for sentence in get_sentences(processed)] + return [undo_replacement(sentence) for sentence in get_sentences(processed, sentence_delimiter=sentence_delimiter)] def replace_abbreviations(text): @@ -77,9 +77,14 @@ def replace_with_separator(text, separator, regexs): return result -def get_sentences(text): - for match in RE_SENTENCE.finditer(text): - yield match.group() +def get_sentences(text, sentence_delimiter=None): + + if sentence_delimiter: + for sentence in text.split(sentence_delimiter): + yield sentence + else: + for match in RE_SENTENCE.finditer(text): + yield match.group() # Taken from Gensim @@ -158,11 +163,11 @@ def merge_syntactic_units(original_units, filtered_units, tags=None): return units -def clean_text_by_sentences(text, language="english", additional_stopwords=None): +def clean_text_by_sentences(text, language="english", additional_stopwords=None, sentence_delimiter=None): """ Tokenizes a given text into sentences, applying filters and lemmatizing them. Returns a SyntacticUnit list. """ init_textcleanner(language, additional_stopwords) - original_sentences = split_sentences(text) + original_sentences = split_sentences(text, sentence_delimiter=sentence_delimiter) filtered_sentences = filter_words(original_sentences) return merge_syntactic_units(original_sentences, filtered_sentences) diff --git a/summa/summarizer.py b/summa/summarizer.py index 1ab296d..717f4c3 100644 --- a/summa/summarizer.py +++ b/summa/summarizer.py @@ -109,12 +109,12 @@ def _extract_most_important_sentences(sentences, ratio, words): return _get_sentences_with_word_count(sentences, words) -def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None): +def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None, sentence_delimiter=None): if not isinstance(text, str): raise ValueError("Text parameter must be a Unicode object (str)!") # Gets a list of processed sentences. - sentences = _clean_text_by_sentences(text, language, additional_stopwords) + sentences = _clean_text_by_sentences(text, language, additional_stopwords, sentence_delimiter=sentence_delimiter) # Creates the graph and calculates the similarity coefficient for every pair of nodes. graph = _build_graph([sentence.token for sentence in sentences]) From 4912ee4fd8bb1bfed31300bcb0bdb23e77e9c11c Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Fri, 3 May 2019 10:30:19 +0200 Subject: [PATCH 2/3] Update readme --- README | 103 +++++---------------------------------------------------- 1 file changed, 8 insertions(+), 95 deletions(-) diff --git a/README b/README index 59951a4..bdf2624 100644 --- a/README +++ b/README @@ -1,103 +1,16 @@ -================ -summa – textrank -================ +=========================================== +summa – textrank with sentence preservation +=========================================== -TextRank implementation for text summarization and keyword extraction in Python 3, -with `optimizations on the similarity function `_. - - -Features --------- - -* Text summarization -* Keyword extraction - -Examples --------- - -Text summarization:: - - >>> text = """Automatic summarization is the process of reducing a text document with a \ - computer program in order to create a summary that retains the most important points \ - of the original document. As the problem of information overload has grown, and as \ - the quantity of data has increased, so has interest in automatic summarization. \ - Technologies that can make a coherent summary take into account variables such as \ - length, writing style and syntax. An example of the use of summarization technology \ - is search engines such as Google. Document summarization is another.""" - - >>> from summa import summarizer - >>> print(summarizer.summarize(text)) - 'Automatic summarization is the process of reducing a text document with a computer - program in order to create a summary that retains the most important points of the - original document.' - - -Keyword extraction:: - - >>> from summa import keywords - >>> print(keywords.keywords(text)) - document - summarization - writing - account - - -Note that line breaks in the input will be used as sentence separators, so be sure -to preprocess your text accordingly. - -Installation ------------- - -This software is `available in PyPI `_. -It depends on `NumPy `_ and `Scipy `_, -two Python libraries for scientific computing. -Pip will automatically install them along with `summa`:: - - pip install summa - -For a better performance of keyword extraction, install `Pattern `_. - - -More examples -------------- - -- Command-line usage:: - - textrank -t FILE - -- Define length of the summary as a proportion of the text (also available in :code:`keywords`):: - - >>> from summa.summarizer import summarize - >>> summarize(text, ratio=0.2) - -- Define length of the summary by aproximate number of words (also available in :code:`keywords`):: - - >>> summarize(text, words=50) - -- Define input text language (also available in :code:`keywords`). - - The available languages are arabic, danish, dutch, english, finnish, french, german, - hungarian, italian, norwegian, polish, porter, portuguese, romanian, russian, - spanish and swedish:: - - - >>> summarize(text, language='spanish') - -- Get results as a list (also available in :code:`keywords`):: - - >>> summarize(text, split=True) - ['Automatic summarization is the process of reducing a text document with a - computer program in order to create a summary that retains the most important - points of the original document.'] +This is a slightly modified version of [Barrios and L{\'{o}}pez's implementation +of textrank](https://github.com/summanlp/textrank) that adds a `sentence_delimiter` +option to the `summarize` function that allows you to override the built-in +sentence splitter. This is important if you need to preserve the sentences for +data consistency. References ------------- -- Mihalcea, R., Tarau, P.: - `"Textrank: Bringing order into texts" `__. - In: Lin, D., Wu, D. (eds.) - Proceedings of EMNLP 2004. pp. 404–411. Association for Computational Linguistics, - Barcelona, Spain. July 2004. - Barrios, F., López, F., Argerich, L., Wachenchauzer, R.: `"Variations of the Similarity Function of TextRank for Automated Summarization" `__. From 6c0c77e2f007c7dc555e9f238a31484ef06a387e Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Fri, 3 May 2019 10:34:34 +0200 Subject: [PATCH 3/3] Revert "Update readme" This reverts commit 4912ee4fd8bb1bfed31300bcb0bdb23e77e9c11c. --- README | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 95 insertions(+), 8 deletions(-) diff --git a/README b/README index bdf2624..59951a4 100644 --- a/README +++ b/README @@ -1,16 +1,103 @@ -=========================================== -summa – textrank with sentence preservation -=========================================== +================ +summa – textrank +================ -This is a slightly modified version of [Barrios and L{\'{o}}pez's implementation -of textrank](https://github.com/summanlp/textrank) that adds a `sentence_delimiter` -option to the `summarize` function that allows you to override the built-in -sentence splitter. This is important if you need to preserve the sentences for -data consistency. +TextRank implementation for text summarization and keyword extraction in Python 3, +with `optimizations on the similarity function `_. + + +Features +-------- + +* Text summarization +* Keyword extraction + +Examples +-------- + +Text summarization:: + + >>> text = """Automatic summarization is the process of reducing a text document with a \ + computer program in order to create a summary that retains the most important points \ + of the original document. As the problem of information overload has grown, and as \ + the quantity of data has increased, so has interest in automatic summarization. \ + Technologies that can make a coherent summary take into account variables such as \ + length, writing style and syntax. An example of the use of summarization technology \ + is search engines such as Google. Document summarization is another.""" + + >>> from summa import summarizer + >>> print(summarizer.summarize(text)) + 'Automatic summarization is the process of reducing a text document with a computer + program in order to create a summary that retains the most important points of the + original document.' + + +Keyword extraction:: + + >>> from summa import keywords + >>> print(keywords.keywords(text)) + document + summarization + writing + account + + +Note that line breaks in the input will be used as sentence separators, so be sure +to preprocess your text accordingly. + +Installation +------------ + +This software is `available in PyPI `_. +It depends on `NumPy `_ and `Scipy `_, +two Python libraries for scientific computing. +Pip will automatically install them along with `summa`:: + + pip install summa + +For a better performance of keyword extraction, install `Pattern `_. + + +More examples +------------- + +- Command-line usage:: + + textrank -t FILE + +- Define length of the summary as a proportion of the text (also available in :code:`keywords`):: + + >>> from summa.summarizer import summarize + >>> summarize(text, ratio=0.2) + +- Define length of the summary by aproximate number of words (also available in :code:`keywords`):: + + >>> summarize(text, words=50) + +- Define input text language (also available in :code:`keywords`). + + The available languages are arabic, danish, dutch, english, finnish, french, german, + hungarian, italian, norwegian, polish, porter, portuguese, romanian, russian, + spanish and swedish:: + + + >>> summarize(text, language='spanish') + +- Get results as a list (also available in :code:`keywords`):: + + >>> summarize(text, split=True) + ['Automatic summarization is the process of reducing a text document with a + computer program in order to create a summary that retains the most important + points of the original document.'] References ------------- +- Mihalcea, R., Tarau, P.: + `"Textrank: Bringing order into texts" `__. + In: Lin, D., Wu, D. (eds.) + Proceedings of EMNLP 2004. pp. 404–411. Association for Computational Linguistics, + Barcelona, Spain. July 2004. - Barrios, F., López, F., Argerich, L., Wachenchauzer, R.: `"Variations of the Similarity Function of TextRank for Automated Summarization" `__.