Python/text_helper.py at main · NowakAndrzej283/Python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import numpy as np
import re


class TextToNumConverter:
    """Bag-of-words encoder that turns texts into a word-count matrix.

    The vocabulary is learned by ``fit_transform`` and reused by
    ``transform``. Each matrix row corresponds to one input text and each
    column to one vocabulary word (columns follow the sorted vocabulary).

    Attributes:
        vocabulary: Sorted list of the unique tokens seen during fitting.
        vocab_map: Mapping ``{token: column index}`` derived from
            ``vocabulary``.
    """

    # Matches every character that is neither a word character nor
    # whitespace, plus the underscore (which ``\w`` would otherwise keep).
    # ``\w`` is Unicode-aware for ``str`` patterns, so accented letters such
    # as "ż" or "ę" survive cleaning instead of being stripped.
    _STRIP_RE = re.compile(r'[^\w\s]|_')

    def __init__(self):
        self.vocabulary = []  # List of unique words
        self.vocab_map = {}  # Dictionary {word: index}

    def fit_transform(self, text_list):
        """Learn the vocabulary from ``text_list`` and encode those texts.

        Any previously learned vocabulary is replaced.

        Args:
            text_list: Iterable of strings to learn from and encode.

        Returns:
            A ``numpy`` array of shape ``(n_texts, n_vocabulary_words)``
            with dtype ``float64`` holding per-text token counts.
        """
        # Step 1: tokenize once and collect the unique tokens.
        clean_texts = [self._clean(t) for t in text_list]

        unique_words = set()
        for tokens in clean_texts:
            unique_words.update(tokens)

        # Sorting makes the column order deterministic.
        self.vocabulary = sorted(unique_words)
        self.vocab_map = {word: i for i, word in enumerate(self.vocabulary)}

        # Step 2: convert the tokenized texts to numbers.
        return self._to_matrix(clean_texts)

    def transform(self, text_list):
        """Encode new texts using the already learned vocabulary.

        Tokens that are not in the vocabulary are ignored.

        Args:
            text_list: Iterable of strings to encode.

        Returns:
            A ``numpy`` array of shape ``(n_texts, n_vocabulary_words)``
            with dtype ``float64`` holding per-text token counts.
        """
        clean_texts = [self._clean(t) for t in text_list]
        return self._to_matrix(clean_texts)

    def _to_matrix(self, clean_texts):
        """Build the count matrix (number of samples x vocabulary size).

        Args:
            clean_texts: List of token lists, one per sample.

        Returns:
            A ``float64`` array where entry ``[i, j]`` is the number of
            times vocabulary word ``j`` occurs in sample ``i``.
        """
        n_samples = len(clean_texts)
        n_features = len(self.vocabulary)

        # Start from an all-zero matrix and accumulate counts into it.
        matrix = np.zeros((n_samples, n_features), dtype=np.float64)

        for row_idx, tokens in enumerate(clean_texts):
            for token in tokens:
                col_idx = self.vocab_map.get(token)
                # Out-of-vocabulary tokens have no column and are skipped.
                if col_idx is not None:
                    # Count occurrences rather than just flagging presence.
                    matrix[row_idx, col_idx] += 1.0

        return matrix

    def _clean(self, text):
        """Lower-case ``text``, drop special characters and split on spaces.

        Letters and digits of any script are kept (e.g. "Żółw!" becomes
        ``["żółw"]``); punctuation, symbols and underscores are removed
        without inserting a separator, so "a,b" becomes ``["ab"]``.

        Args:
            text: The string to tokenize.

        Returns:
            List of cleaned tokens in their original order.
        """
        text = text.lower()
        text = self._STRIP_RE.sub('', text)
        return text.split()