-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_stuff.py
More file actions
85 lines (65 loc) · 3.54 KB
/
Copy pathextract_stuff.py
File metadata and controls
85 lines (65 loc) · 3.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import pandas as pd
def augment_and_extract_metadata(dataset, extractor, topic_labels, file_path, debug=False):
    """Add a topic column and one-hot topic indicator columns to ``dataset``.

    The result is cached as a CSV file at ``file_path``. If that file already
    exists, it is loaded and returned and the extractor is not called.

    Args:
        dataset: pandas DataFrame with a ``'text'`` column. It is modified in
            place: one 0/1 integer column per entry of ``topic_labels`` and a
            ``'topic'`` column are added.
        extractor: Object exposing ``extract_topic(text, topic_labels)``,
            called once per row; its return value is stored as the topic.
        topic_labels: Iterable of labels; each becomes an indicator column
            that is 1 where the extracted topic equals the label, else 0.
        file_path: CSV path used as the cache location.
        debug: When true, print per-row diagnostic output.

    Returns:
        The DataFrame read from ``file_path`` when the cache exists, otherwise
        the (mutated) ``dataset`` object itself.
    """
    if os.path.exists(file_path):
        # A cached result exists: reuse it instead of re-running the extractor.
        print(f"Loading augmented dataset from {file_path}")
        return pd.read_csv(file_path)

    print(f"Augmenting dataset and saving to {file_path}")
    total_rows = len(dataset)
    # Only touch the 'text' column when there are rows to process.
    texts = dataset['text'].tolist() if total_rows else []

    topics = []
    last_milestone = 0  # Highest 5% step that has already been reported.
    for done, text in enumerate(texts, start=1):
        topic = extractor.extract_topic(text, topic_labels)
        topics.append(topic)

        percentage_complete = (done / total_rows) * 100
        if debug:
            print(f"DEBUG - Text: {text}")
            print(f"DEBUG - Generated Metadata: Topic - {topic}")
            print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {done} of {total_rows}")

        # Report progress once each time a new 5% step is reached, rather than
        # for every row whose truncated percentage happens to be divisible by 5.
        milestone = int(percentage_complete) // 5
        if milestone > last_milestone:
            last_milestone = milestone
            print(f"Percentage of Completion: {percentage_complete:.2f}%, {done} of {total_rows}")

    # Assign whole columns positionally. Per-cell label-based writes would
    # touch every row sharing an index label when the index has duplicates.
    for label in topic_labels:
        dataset[label] = [1 if topic == label else 0 for topic in topics]
    dataset['topic'] = topics

    # Make sure the target directory exists so the computed result is not lost
    # to a failed write.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    dataset.to_csv(file_path, index=False)
    return dataset
def predict_sentiment(dataset, sentiment_analyzer, file_path, debug=False, batch_size=32):
    """Add a ``'sentiment'`` column to ``dataset`` using batched classification.

    The result is cached as a CSV file at ``file_path``. If that file already
    exists, it is loaded and returned and the analyzer is not called.

    Args:
        dataset: pandas DataFrame with a ``'text'`` column. It is modified in
            place: a ``'sentiment'`` column is added.
        sentiment_analyzer: Object exposing ``truncate_text(text)``,
            ``classifier(texts, truncation=..., padding=..., max_length=...)``
            returning one mapping with a ``'label'`` key per text, and
            ``map_label_to_target(label)``.
        file_path: CSV path used as the cache location.
        debug: When true, print per-batch diagnostic output.
        batch_size: Number of texts sent to the classifier per call; must be
            at least 1.

    Returns:
        The DataFrame read from ``file_path`` when the cache exists, otherwise
        the (mutated) ``dataset`` object itself.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 and prediction has to
            be run (i.e. no cached file exists).
    """
    if os.path.exists(file_path):
        # A cached result exists: reuse it instead of re-running the classifier.
        print(f"Loading sentiment-augmented dataset from {file_path}")
        return pd.read_csv(file_path)

    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print(f"Predicting sentiment and saving to {file_path}")
    total_rows = len(dataset)
    # Materialise the texts once so batches are taken strictly by position,
    # independent of the DataFrame's index labels.
    texts = dataset['text'].tolist() if total_rows else []

    sentiments = []
    last_milestone = 0  # Highest 5% step that has already been reported.
    for start in range(0, total_rows, batch_size):
        end = min(start + batch_size, total_rows)

        # Truncate texts before handing them to the classifier.
        truncated_batch_texts = [sentiment_analyzer.truncate_text(text) for text in texts[start:end]]
        batch_results = sentiment_analyzer.classifier(
            truncated_batch_texts,
            truncation=True,
            padding=True,
            max_length=512,
        )
        sentiments.extend(
            sentiment_analyzer.map_label_to_target(result['label']) for result in batch_results
        )

        percentage_complete = (end / total_rows) * 100
        if debug:
            print(f"DEBUG - Processed batch {start // batch_size + 1}: {start} to {end}")
            print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {end} of {total_rows}")

        # Report progress once each time a new 5% step is reached, rather than
        # only when the truncated percentage happens to be divisible by 5.
        milestone = int(percentage_complete) // 5
        if milestone > last_milestone:
            last_milestone = milestone
            print(f"Percentage of Completion: {percentage_complete:.2f}%")

    dataset['sentiment'] = sentiments

    # Make sure the target directory exists so the computed result is not lost
    # to a failed write.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    dataset.to_csv(file_path, index=False)
    return dataset