-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathProfanityDetector.cs
More file actions
178 lines (142 loc) · 8.33 KB
/
ProfanityDetector.cs
File metadata and controls
178 lines (142 loc) · 8.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
using NAudio.Wave;
using Spectre.Console;
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
using Whisper.net;
namespace CurseWordExtractor
{
internal static class ProfanityDetector
{
/// <summary>
/// Total playing time of the audio file most recently opened by <c>DetectProfanity</c>.
/// It is overwritten on every call (assigned from the wave reader's <c>TotalTime</c>)
/// and is not read anywhere inside this class.
/// </summary>
public static TimeSpan totalDuration;
/// <summary>
/// Transcribes a WAV file with Whisper in fixed-size chunks and collects every token
/// (or pair of adjacent tokens) that matches an entry in <paramref name="badWords"/>.
/// </summary>
/// <param name="whisperAudioFile">Path of the WAV file to scan; opened with <c>WaveFileReader</c>.</param>
/// <param name="badWords">Set of words to look for; lookups use the set's own comparer.</param>
/// <param name="modelPath">Path of the Whisper model file handed to <c>WhisperFactory.FromPath</c>.</param>
/// <returns>The matches in the order they were detected.</returns>
/// <remarks>
/// Side effects: sets <c>totalDuration</c> to the file's total time and writes progress
/// lines to the console.
/// </remarks>
public static async Task<Queue<ProfanityMatch>> DetectProfanity(string whisperAudioFile, HashSet<string> badWords, string modelPath = "ggml-small.en.bin")
{
    var foundMatches = new Queue<ProfanityMatch>();

    AnsiConsole.MarkupLineInterpolated($"\n\n\t[underline]Loading Model[/]: [bold]{modelPath}[/]");
    using var factory = WhisperFactory.FromPath(modelPath);

    AnsiConsole.MarkupLine("\t\t[blue]:small_blue_diamond:[/] Building Processor...");
    await using var processor = factory.CreateBuilder()
        .WithLanguage("en")
        .WithProbabilities()
        .WithTokenTimestamps()
        .Build();
    AnsiConsole.MarkupLine("\t\t[blue]:small_blue_diamond:[/] Processor built and loaded");

    using var reader = new WaveFileReader(whisperAudioFile);
    totalDuration = reader.TotalTime;

    // Each chunk is fed to Whisper on its own so every 30 seconds starts with a fresh context window.
    int chunkSeconds = 30;
    // Consecutive chunks share one second of audio; TryCreateMatch filters words that start in that tail.
    int overlapSeconds = 1;

    // Translate the durations above into byte counts for this file's format.
    int byteRate = reader.WaveFormat.AverageBytesPerSecond;
    int overlapBytes = byteRate * overlapSeconds;
    byte[] chunkBytes = new byte[byteRate * chunkSeconds];

    AnsiConsole.MarkupLineInterpolated($"\t\t[blue]:small_blue_diamond:[/] Audio Duration: {reader.TotalTime}");
    AnsiConsole.MarkupLineInterpolated($"\t\t[blue]:small_blue_diamond:[/] Processing in {chunkSeconds}s chunks with {overlapSeconds}s overlap...\n");

    while (reader.Position < reader.Length)
    {
        // Remember where this chunk begins so chunk-relative token times can be made absolute.
        TimeSpan chunkOffset = reader.CurrentTime;

        int byteCount = reader.Read(chunkBytes, 0, chunkBytes.Length);
        if (byteCount == 0)
        {
            break; // nothing left to read
        }

        float[] samples = ConvertBytesToPcm(chunkBytes, byteCount);

        await foreach (var segment in processor.ProcessAsync(samples))
        {
            AnsiConsole.MarkupLineInterpolated($"\r [dim]➜ \"{segment.Text.Trim()}\"[/]");

            // The previous token is kept so a word split across two tokens can be re-joined.
            // It is reset for every segment.
            string previousText = "";
            TimeSpan previousStart = TimeSpan.Zero;

            foreach (var token in segment.Tokens)
            {
                // Token Start/End are multiplied by 10 to obtain milliseconds
                // (presumably Whisper reports them in 10 ms units - confirm against Whisper.net).
                TimeSpan tokenStart = TimeSpan.FromMilliseconds(token.Start * 10);
                TimeSpan tokenEnd = TimeSpan.FromMilliseconds(token.End * 10);

                var (found, word, wordStart, wordEnd) = CheckToken(
                    token.Text, previousText, previousStart, tokenStart, tokenEnd, badWords);

                if (found
                    && TryCreateMatch(word, wordStart, wordEnd,
                                      chunkOffset, token.Probability,
                                      chunkSeconds, overlapSeconds,
                                      reader.Position, reader.Length) is { } match)
                {
                    foundMatches.Enqueue(match);
                }

                // The current token becomes "previous" for the next iteration.
                previousText = token.Text;
                previousStart = tokenStart;
            }
        }

        // Unless the file is exhausted, step back by the overlap so the next chunk re-reads the tail.
        if (reader.Position < reader.Length)
        {
            reader.Position -= overlapBytes;
        }
    }

    return foundMatches;
}
/// <summary>
/// Converts raw 16-bit PCM audio bytes into the normalized <c>float[]</c> that Whisper consumes.
/// </summary>
/// <param name="buffer">Buffer holding the raw audio bytes.</param>
/// <param name="bytesRead">Number of valid bytes at the start of <paramref name="buffer"/>.</param>
/// <returns>
/// One float per 2-byte sample, each in the range [-1.0, 1.0). A trailing odd byte is ignored.
/// </returns>
private static float[] ConvertBytesToPcm(byte[] buffer, int bytesRead)
{
    const int BytesPerSample = sizeof(short); // 16-bit audio: one sample = 2 bytes (1000 bytes = 500 samples)

    int sampleCount = bytesRead / BytesPerSample;
    float[] pcmData = new float[sampleCount];
    ReadOnlySpan<byte> source = buffer;

    for (int i = 0; i < sampleCount; i++)
    {
        // WAV PCM samples are stored little-endian, so decode them explicitly as little-endian
        // instead of relying on the host byte order (which is what BitConverter does).
        short sample = System.Buffers.Binary.BinaryPrimitives.ReadInt16LittleEndian(
            source.Slice(i * BytesPerSample, BytesPerSample));

        // Scale by 32768 (the magnitude of short.MinValue) to normalize into [-1.0, 1.0).
        pcmData[i] = sample / 32768.0f;
    }

    return pcmData;
}
/// <summary>
/// Checks whether the current token, or the previous token joined with the current one,
/// is contained in <paramref name="badWords"/>.
/// </summary>
/// <param name="currentText">Raw text of the token being examined.</param>
/// <param name="prevTokenText">Raw text of the preceding token; empty when there is none.</param>
/// <param name="prevTokenStart">Start time of the preceding token.</param>
/// <param name="tokenStart">Start time of the current token.</param>
/// <param name="tokenEnd">End time of the current token.</param>
/// <param name="badWords">Set of words to match against.</param>
/// <returns>
/// <c>Found</c> tells whether a match occurred. On a match, <c>Word</c> is the cleaned text that
/// matched and <c>Start</c>/<c>End</c> span either the current token or, for a merged match,
/// from the start of the previous token to the end of the current one. Without a match the
/// word is empty and both times are <see cref="TimeSpan.Zero"/>.
/// </returns>
private static (bool Found, string Word, TimeSpan Start, TimeSpan End) CheckToken(
    string currentText, string prevTokenText, TimeSpan prevTokenStart,
    TimeSpan tokenStart, TimeSpan tokenEnd, HashSet<string> badWords)
{
    // Check the INDIVIDUAL token first.
    string cleanedCurrent = Helpers.RemovePunctuation(currentText);
    if (badWords.Contains(cleanedCurrent))
    {
        return (true, cleanedCurrent, tokenStart, tokenEnd);
    }

    // Check the MERGED token (previous + current), e.g. "fu" + "cker".
    // With no previous token the merged text equals the current text, which was just rejected.
    if (!string.IsNullOrEmpty(prevTokenText))
    {
        string mergedClean = Helpers.RemovePunctuation(prevTokenText + currentText);
        if (badWords.Contains(mergedClean))
        {
            // MarkupLineInterpolated escapes interpolated values itself; wrapping them in
            // Markup.Escape as well would double-escape any '[' or ']' in the token text.
            AnsiConsole.MarkupLineInterpolated($"\r [#569CD6]❯[/] [bold white]Split Detected:[/][grey]'{prevTokenText}'[/] [teal]+[/] [grey]'{currentText}'[/] [teal]→[/] [bold red]'{mergedClean}'[/]");
            return (true, mergedClean, prevTokenStart, tokenEnd); // use the start time of the FIRST part
        }
    }

    return (false, "", TimeSpan.Zero, TimeSpan.Zero);
}
/// <summary>
/// Validates a detected match against the overlap and duration rules, applies padding, and
/// builds the resulting <c>ProfanityMatch</c>.
/// </summary>
/// <param name="matchWord">The matched word, used for the result and for console output.</param>
/// <param name="matchStart">Start of the match relative to the start of the chunk.</param>
/// <param name="matchEnd">End of the match relative to the start of the chunk.</param>
/// <param name="chunkStartTime">Position of the chunk within the whole file.</param>
/// <param name="confidence">Confidence value stored on the result unchanged.</param>
/// <param name="secondsPerChunk">Length of a chunk in seconds.</param>
/// <param name="secondsOverlap">Seconds of audio shared between consecutive chunks.</param>
/// <param name="readerPosition">Current byte position of the audio reader.</param>
/// <param name="readerLength">Total byte length of the audio data.</param>
/// <returns>
/// The padded match, or <c>null</c> when the word starts inside the trailing overlap window
/// of a chunk that is not the last one.
/// </returns>
private static ProfanityMatch? TryCreateMatch(string matchWord, TimeSpan matchStart, TimeSpan matchEnd,
    TimeSpan chunkStartTime, float confidence,
    int secondsPerChunk, int secondsOverlap,
    long readerPosition, long readerLength)
{
    // Overlap safety check: while more audio remains, a word that starts in the last
    // `secondsOverlap` seconds of the chunk is skipped here to avoid reporting it twice.
    bool moreAudioRemains = readerPosition < readerLength;
    if (moreAudioRemains && matchStart.TotalSeconds > (secondsPerChunk - secondsOverlap))
    {
        return null;
    }

    // Convert chunk-relative times into positions within the whole file.
    TimeSpan actualStart = chunkStartTime.Add(matchStart);
    TimeSpan actualEnd = chunkStartTime.Add(matchEnd);

    // If Whisper claims the word took longer than 2 seconds, cap it.
    // This prevents loud noises from dragging out a mute forever. Ex: car chases with loud noises.
    const double maxDurationSeconds = 2;
    if ((actualEnd - actualStart).TotalSeconds > maxDurationSeconds)
    {
        actualEnd = actualStart.Add(TimeSpan.FromSeconds(maxDurationSeconds));
        // MarkupLineInterpolated already escapes interpolated values, so no Markup.Escape here.
        AnsiConsole.MarkupLineInterpolated($"\r [#CE9178]❯[/] [bold white]Length Capped:[/] [grey]'{matchWord}' duration reduced to[/] [bold yellow]2.0s[/]");
    }

    // Pad the beep: 200 ms before the word and 400 ms after it, never starting before 0.
    TimeSpan beepStart = actualStart.Subtract(TimeSpan.FromMilliseconds(200));
    TimeSpan beepEnd = actualEnd.Add(TimeSpan.FromMilliseconds(400));
    if (beepStart < TimeSpan.Zero)
    {
        beepStart = TimeSpan.Zero;
    }

    AnsiConsole.MarkupLineInterpolated($"\r [bold #F44747]❯[/] [bold white]Potty word found:[/] [bold #F44747]'{matchWord}'[/] [grey]at[/] [underline #DCDCAA]{actualStart:hh\\:mm\\:ss\\.fff}[/]");

    return new ProfanityMatch
    {
        Word = matchWord,
        Confidence = confidence,
        Start = beepStart,
        End = beepEnd
    };
}
}
}