From a07b0d7da735e07441ee7fbefd0197b5ca13d8fe Mon Sep 17 00:00:00 2001
From: fsimonjetz <fabian.simonjetz@rub.de>
Date: Fri, 3 May 2019 09:49:17 +0200
Subject: [PATCH 1/3] Add sentence_delimiter option to preserve original
 sentence segmentation

---
 summa/preprocessing/textcleaner.py | 19 ++++++++++++-------
 summa/summarizer.py                |  4 ++--
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/summa/preprocessing/textcleaner.py b/summa/preprocessing/textcleaner.py
index 9716774..c1d4e8e 100644
--- a/summa/preprocessing/textcleaner.py
+++ b/summa/preprocessing/textcleaner.py
@@ -56,9 +56,9 @@ def init_textcleanner(language, additional_stopwords):
     set_stopwords_by_language(language, additional_stopwords)
 
 
-def split_sentences(text):
+def split_sentences(text, sentence_delimiter=None):
     processed = replace_abbreviations(text)
-    return [undo_replacement(sentence) for sentence in get_sentences(processed)]
+    return [undo_replacement(sentence) for sentence in get_sentences(processed, sentence_delimiter=sentence_delimiter)]
 
 
 def replace_abbreviations(text):
@@ -77,9 +77,14 @@ def replace_with_separator(text, separator, regexs):
     return result
 
 
-def get_sentences(text):
-    for match in RE_SENTENCE.finditer(text):
-        yield match.group()
+def get_sentences(text, sentence_delimiter=None):
+    """Yield the sentences of text: its pieces split on sentence_delimiter when one is given, otherwise each RE_SENTENCE match."""
+    if sentence_delimiter:  # None or "" is falsy and falls through to the regex splitter below
+        for sentence in text.split(sentence_delimiter):
+            yield sentence
+    else:
+        for match in RE_SENTENCE.finditer(text):
+            yield match.group()
 
 
 # Taken from Gensim
@@ -158,11 +163,11 @@ def merge_syntactic_units(original_units, filtered_units, tags=None):
     return units
 
 
-def clean_text_by_sentences(text, language="english", additional_stopwords=None):
+def clean_text_by_sentences(text, language="english", additional_stopwords=None, sentence_delimiter=None):
     """ Tokenizes a given text into sentences, applying filters and lemmatizing them.
     Returns a SyntacticUnit list. """
     init_textcleanner(language, additional_stopwords)
-    original_sentences = split_sentences(text)
+    original_sentences = split_sentences(text, sentence_delimiter=sentence_delimiter)
     filtered_sentences = filter_words(original_sentences)
 
     return merge_syntactic_units(original_sentences, filtered_sentences)
diff --git a/summa/summarizer.py b/summa/summarizer.py
index 1ab296d..717f4c3 100644
--- a/summa/summarizer.py
+++ b/summa/summarizer.py
@@ -109,12 +109,12 @@ def _extract_most_important_sentences(sentences, ratio, words):
         return _get_sentences_with_word_count(sentences, words)
 
 
-def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None):
+def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None, sentence_delimiter=None):
     if not isinstance(text, str):
         raise ValueError("Text parameter must be a Unicode object (str)!")
 
     # Gets a list of processed sentences.
-    sentences = _clean_text_by_sentences(text, language, additional_stopwords)
+    sentences = _clean_text_by_sentences(text, language, additional_stopwords, sentence_delimiter=sentence_delimiter)
 
     # Creates the graph and calculates the similarity coefficient for every pair of nodes.
     graph = _build_graph([sentence.token for sentence in sentences])

From 4912ee4fd8bb1bfed31300bcb0bdb23e77e9c11c Mon Sep 17 00:00:00 2001
From: fsimonjetz <fabian.simonjetz@rub.de>
Date: Fri, 3 May 2019 10:30:19 +0200
Subject: [PATCH 2/3] Update readme

---
 README | 103 +++++----------------------------------------------------
 1 file changed, 8 insertions(+), 95 deletions(-)

diff --git a/README b/README
index 59951a4..bdf2624 100644
--- a/README
+++ b/README
@@ -1,103 +1,16 @@
-================
-summa – textrank
-================
+===========================================
+summa – textrank with sentence preservation
+===========================================
 
-TextRank implementation for text summarization and keyword extraction in Python 3,
-with `optimizations on the similarity function <https://arxiv.org/pdf/1602.03606.pdf>`_.
-
-
-Features
---------
-
-* Text summarization
-* Keyword extraction
-
-Examples
---------
-
-Text summarization::
-
-    >>> text = """Automatic summarization is the process of reducing a text document with a \
-    computer program in order to create a summary that retains the most important points \
-    of the original document. As the problem of information overload has grown, and as \
-    the quantity of data has increased, so has interest in automatic summarization. \
-    Technologies that can make a coherent summary take into account variables such as \
-    length, writing style and syntax. An example of the use of summarization technology \
-    is search engines such as Google. Document summarization is another."""
-
-    >>> from summa import summarizer
-    >>> print(summarizer.summarize(text))
-    'Automatic summarization is the process of reducing a text document with a computer
-    program in order to create a summary that retains the most important points of the
-    original document.'
-
-
-Keyword extraction::
-
-    >>> from summa import keywords
-    >>> print(keywords.keywords(text))
-    document
-    summarization
-    writing
-    account
-
-
-Note that line breaks in the input will be used as sentence separators, so be sure
-to preprocess your text accordingly.
-
-Installation
-------------
-
-This software is `available in PyPI <https://pypi.org/project/summa/>`_.
-It depends on `NumPy <http://www.numpy.org/>`_ and `Scipy <https://www.scipy.org/>`_,
-two Python libraries for scientific computing.
-Pip will automatically install them along with `summa`::
-
-    pip install summa
-
-For a better performance of keyword extraction, install `Pattern <http://www.clips.ua.ac.be/pattern>`_.
-
-
-More examples
--------------
-
-- Command-line usage::
-
-    textrank -t FILE
-
-- Define length of the summary as a proportion of the text (also available in :code:`keywords`)::
-
-    >>> from summa.summarizer import summarize
-    >>> summarize(text, ratio=0.2)
-
-- Define length of the summary by aproximate number of words (also available in :code:`keywords`)::
-
-    >>> summarize(text, words=50)
-
-- Define input text language (also available in :code:`keywords`).
-
-  The available languages are arabic, danish, dutch, english, finnish, french, german,
-  hungarian, italian, norwegian, polish, porter, portuguese, romanian, russian,
-  spanish and swedish::
-
-
-    >>> summarize(text, language='spanish')
-
-- Get results as a list (also available in :code:`keywords`)::
-
-    >>> summarize(text, split=True)
-    ['Automatic summarization is the process of reducing a text document with a
-    computer program in order to create a summary that retains the most important
-    points of the original document.']
+This is a slightly modified version of [Barrios and L{\'{o}}pez's implementation
+of textrank](https://github.com/summanlp/textrank) that adds a `sentence_delimiter` 
+option to the `summarize` function that allows you to override the built-in 
+sentence splitter. This is important if you need to preserve the sentences for 
+data consistency.
 
 
 References
 -------------
-- Mihalcea, R., Tarau, P.:
-  `"Textrank: Bringing order into texts" <http://www.aclweb.org/anthology/W04-3252>`__.
-  In: Lin, D., Wu, D. (eds.)
-  Proceedings of EMNLP 2004. pp. 404–411. Association for Computational Linguistics,
-  Barcelona, Spain. July 2004.
 
 - Barrios, F., López, F., Argerich, L., Wachenchauzer, R.:
   `"Variations of the Similarity Function of TextRank for Automated Summarization" <https://arxiv.org/pdf/1602.03606.pdf>`__.

From 6c0c77e2f007c7dc555e9f238a31484ef06a387e Mon Sep 17 00:00:00 2001
From: fsimonjetz <fabian.simonjetz@rub.de>
Date: Fri, 3 May 2019 10:34:34 +0200
Subject: [PATCH 3/3] Revert "Update readme"

This reverts commit 4912ee4fd8bb1bfed31300bcb0bdb23e77e9c11c.
---
 README | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 95 insertions(+), 8 deletions(-)

diff --git a/README b/README
index bdf2624..59951a4 100644
--- a/README
+++ b/README
@@ -1,16 +1,103 @@
-===========================================
-summa – textrank with sentence preservation
-===========================================
+================
+summa – textrank
+================
 
-This is a slightly modified version of [Barrios and L{\'{o}}pez's implementation
-of textrank](https://github.com/summanlp/textrank) that adds a `sentence_delimiter` 
-option to the `summarize` function that allows you to override the built-in 
-sentence splitter. This is important if you need to preserve the sentences for 
-data consistency.
+TextRank implementation for text summarization and keyword extraction in Python 3,
+with `optimizations on the similarity function <https://arxiv.org/pdf/1602.03606.pdf>`_.
+
+
+Features
+--------
+
+* Text summarization
+* Keyword extraction
+
+Examples
+--------
+
+Text summarization::
+
+    >>> text = """Automatic summarization is the process of reducing a text document with a \
+    computer program in order to create a summary that retains the most important points \
+    of the original document. As the problem of information overload has grown, and as \
+    the quantity of data has increased, so has interest in automatic summarization. \
+    Technologies that can make a coherent summary take into account variables such as \
+    length, writing style and syntax. An example of the use of summarization technology \
+    is search engines such as Google. Document summarization is another."""
+
+    >>> from summa import summarizer
+    >>> print(summarizer.summarize(text))
+    'Automatic summarization is the process of reducing a text document with a computer
+    program in order to create a summary that retains the most important points of the
+    original document.'
+
+
+Keyword extraction::
+
+    >>> from summa import keywords
+    >>> print(keywords.keywords(text))
+    document
+    summarization
+    writing
+    account
+
+
+Note that line breaks in the input will be used as sentence separators, so be sure
+to preprocess your text accordingly.
+
+Installation
+------------
+
+This software is `available in PyPI <https://pypi.org/project/summa/>`_.
+It depends on `NumPy <http://www.numpy.org/>`_ and `Scipy <https://www.scipy.org/>`_,
+two Python libraries for scientific computing.
+Pip will automatically install them along with `summa`::
+
+    pip install summa
+
+For a better performance of keyword extraction, install `Pattern <http://www.clips.ua.ac.be/pattern>`_.
+
+
+More examples
+-------------
+
+- Command-line usage::
+
+    textrank -t FILE
+
+- Define length of the summary as a proportion of the text (also available in :code:`keywords`)::
+
+    >>> from summa.summarizer import summarize
+    >>> summarize(text, ratio=0.2)
+
+- Define length of the summary by aproximate number of words (also available in :code:`keywords`)::
+
+    >>> summarize(text, words=50)
+
+- Define input text language (also available in :code:`keywords`).
+
+  The available languages are arabic, danish, dutch, english, finnish, french, german,
+  hungarian, italian, norwegian, polish, porter, portuguese, romanian, russian,
+  spanish and swedish::
+
+
+    >>> summarize(text, language='spanish')
+
+- Get results as a list (also available in :code:`keywords`)::
+
+    >>> summarize(text, split=True)
+    ['Automatic summarization is the process of reducing a text document with a
+    computer program in order to create a summary that retains the most important
+    points of the original document.']
 
 
 References
 -------------
+- Mihalcea, R., Tarau, P.:
+  `"Textrank: Bringing order into texts" <http://www.aclweb.org/anthology/W04-3252>`__.
+  In: Lin, D., Wu, D. (eds.)
+  Proceedings of EMNLP 2004. pp. 404–411. Association for Computational Linguistics,
+  Barcelona, Spain. July 2004.
 
 - Barrios, F., López, F., Argerich, L., Wachenchauzer, R.:
   `"Variations of the Similarity Function of TextRank for Automated Summarization" <https://arxiv.org/pdf/1602.03606.pdf>`__.