reviseParagraphClassification returns paragraphs, but it doesn't look like paragraphs gets manipulated in any way; reviseParagraphs seems to be where the new filtered data is. Is reviseParagraphs supposed to be returned instead of paragraphs?
I added the Python reference code for convenience.
def revise_paragraph_classification(paragraphs, max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT):
    """
    Context-sensitive paragraph classification. Assumes that
    classify_paragraphs has already been called, so every paragraph
    already carries its context-free class in ``cf_class``.

    The paragraphs are revised in place: the final class of each one is
    written to its ``class_type`` attribute. Nothing is returned.

    ``max_heading_distance`` limits how much text (summed ``len`` of the
    ``text`` of the paragraphs in between) may separate a heading from
    the next 'good' paragraph for that heading to be promoted.
    """

    def good_paragraph_ahead(start):
        # Scan forward from index ``start`` and report whether a 'good'
        # paragraph shows up before the text skipped so far exceeds
        # max_heading_distance.
        skipped = 0
        for position in range(start, len(paragraphs)):
            if skipped > max_heading_distance:
                break
            candidate = paragraphs[position]
            if candidate.class_type == 'good':
                return True
            skipped += len(candidate.text)
        return False

    # start from the context-free classes
    for item in paragraphs:
        item.class_type = item.cf_class

    # short headings shortly followed by good text become 'neargood'
    for index, item in enumerate(paragraphs):
        if not item.heading or item.class_type != 'short':
            continue
        if good_paragraph_ahead(index + 1):
            item.class_type = 'neargood'

    # decide every 'short' paragraph from its neighbours; the verdicts are
    # collected first and applied afterwards, so one decision cannot
    # influence another within this pass
    verdicts = {}
    for index, item in enumerate(paragraphs):
        if item.class_type != 'short':
            continue
        before = get_prev_neighbour(index, paragraphs, ignore_neargood=True)
        after = get_next_neighbour(index, paragraphs, ignore_neargood=True)
        surroundings = {before, after}
        if surroundings == {'good'}:
            verdicts[index] = 'good'
        elif surroundings == {'bad'}:
            verdicts[index] = 'bad'
        # from here on the neighbours are mixed: one 'good', one 'bad';
        # a 'neargood' paragraph sitting in front of the bad side rescues
        # the short paragraph
        elif before == 'bad' and \
                get_prev_neighbour(index, paragraphs, ignore_neargood=False) == 'neargood':
            verdicts[index] = 'good'
        elif after == 'bad' and \
                get_next_neighbour(index, paragraphs, ignore_neargood=False) == 'neargood':
            verdicts[index] = 'good'
        else:
            verdicts[index] = 'bad'

    for index, verdict in verdicts.items():
        paragraphs[index].class_type = verdict

    # resolve 'neargood': bad only when both neighbours are bad
    for index, item in enumerate(paragraphs):
        if item.class_type != 'neargood':
            continue
        before = get_prev_neighbour(index, paragraphs, ignore_neargood=True)
        after = get_next_neighbour(index, paragraphs, ignore_neargood=True)
        if before == 'bad' and after == 'bad':
            item.class_type = 'bad'
        else:
            item.class_type = 'good'

    # headings that ended up 'bad' only because of their context get a
    # second chance when good text follows closely
    for index, item in enumerate(paragraphs):
        if not item.heading or item.class_type != 'bad' or item.cf_class == 'bad':
            continue
        if good_paragraph_ahead(index + 1):
            item.class_type = 'good'
reviseParagraphClassification returns paragraphs, but it doesn't look like the paragraphs get manipulated in any way, and that reviseParagraphs is where the new filtered data is. Is reviseParagraphs supposed to be returned instead of paragraphs? I added the Python reference code for convenience.