Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions summa/preprocessing/textcleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ def init_textcleanner(language, additional_stopwords):
set_stopwords_by_language(language, additional_stopwords)


def split_sentences(text, sentence_delimiter=None):
    """Split ``text`` into a list of sentence strings.

    The text is first passed through ``replace_abbreviations`` and each
    resulting sentence is passed through ``undo_replacement`` before being
    returned (presumably so that periods inside abbreviations are not
    mistaken for sentence boundaries -- confirm against those helpers).

    Args:
        text: The raw text to split.
        sentence_delimiter: Optional delimiter forwarded to
            ``get_sentences``. When it is ``None`` (the default),
            ``get_sentences`` is called exactly as before this parameter
            existed.

    Returns:
        A list with one string per sentence, in order of appearance.
    """
    processed = replace_abbreviations(text)
    sentences = get_sentences(processed, sentence_delimiter=sentence_delimiter)
    return [undo_replacement(sentence) for sentence in sentences]


def replace_abbreviations(text):
Expand All @@ -77,9 +77,14 @@ def replace_with_separator(text, separator, regexs):
return result


def get_sentences(text, sentence_delimiter=None):
    """Lazily yield the sentences contained in ``text``.

    Args:
        text: The text to split.
        sentence_delimiter: Optional string that separates sentences. When
            it is truthy, ``text`` is split on every occurrence of it with
            ``str.split`` and each piece is yielded unchanged (pieces are
            not stripped, and empty pieces are yielded too). When it is
            ``None`` or empty, sentences are the successive matches of the
            module-level ``RE_SENTENCE`` pattern.

    Yields:
        One string per sentence, in order of appearance.
    """
    if sentence_delimiter:
        # An explicit delimiter bypasses the regex-based splitter entirely.
        for sentence in text.split(sentence_delimiter):
            yield sentence
    else:
        for match in RE_SENTENCE.finditer(text):
            yield match.group()


# Taken from Gensim
Expand Down Expand Up @@ -158,11 +163,11 @@ def merge_syntactic_units(original_units, filtered_units, tags=None):
return units


def clean_text_by_sentences(text, language="english", additional_stopwords=None, sentence_delimiter=None):
    """ Tokenizes a given text into sentences, applying filters and lemmatizing them.
    Returns a SyntacticUnit list.

    Args:
        text: The raw text to process.
        language: Language name passed to ``init_textcleanner``.
        additional_stopwords: Extra stopwords passed to ``init_textcleanner``.
        sentence_delimiter: Optional delimiter forwarded to
            ``split_sentences``; ``None`` (the default) leaves sentence
            splitting as it was before this parameter existed.
    """
    # Language-dependent state must be initialised before any filtering.
    init_textcleanner(language, additional_stopwords)
    original_sentences = split_sentences(text, sentence_delimiter=sentence_delimiter)
    filtered_sentences = filter_words(original_sentences)

    # Pair each original sentence with its filtered counterpart.
    return merge_syntactic_units(original_sentences, filtered_sentences)
Expand Down
4 changes: 2 additions & 2 deletions summa/summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,12 +109,12 @@ def _extract_most_important_sentences(sentences, ratio, words):
return _get_sentences_with_word_count(sentences, words)


def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None):
def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None, sentence_delimiter=None):
if not isinstance(text, str):
raise ValueError("Text parameter must be a Unicode object (str)!")

# Gets a list of processed sentences.
sentences = _clean_text_by_sentences(text, language, additional_stopwords)
sentences = _clean_text_by_sentences(text, language, additional_stopwords, sentence_delimiter=sentence_delimiter)

# Creates the graph and calculates the similarity coefficient for every pair of nodes.
graph = _build_graph([sentence.token for sentence in sentences])
Expand Down