Binary-Context-Transformer/binarycontexttransformer.py at master · USDepartmentofLabor/Binary-Context-Transformer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import numpy as np
import scipy as sp
from sklearn.base import TransformerMixin
from scipy.sparse import csc_matrix, csr_matrix


class BinaryContextTransformer(TransformerMixin):
    """
    Expands base features into interaction terms when they appear with
    different context features. Base features are variables that may have different
    meanings in different contexts. Context features are indicator variables that
    denote which context a record belongs to. Both base features and context features
    must be binary.

    After `fit`, two attributes describe the learned interaction columns:

    * `col_pairs`: list of `(context_index, feature_index)` tuples; the
      position of a pair in the list is its output column index.
    * `vocabulary`: dict mapping `"<context name>_x_<feature name>"` to the
      output column index.
    """

    def __init__(self, features, contexts, progress=None):
        """
        Args:
            features: names of base feature columns for input matrix
            contexts: names of context feature columns for input matrix
            progress: function of format progress_fn(iter, total) that takes
                an iterable and an integer with the total number of items and
                returns a generator to track progress at each step of the
                iterable (default=None). `total` is passed by keyword.
        """
        self.features = features
        self.contexts = contexts
        self.col_pairs = []
        self.progress = progress
        self.vocabulary = {}

    def _check_columns(self, X, X_context):
        """
        Verify that each input has one column per configured name.

        Raises:
            AssertionError: if a column count does not match.
        """
        # Raised explicitly instead of via `assert` so the validation is not
        # stripped under `python -O`; the exception type stays the same.
        if X.shape[1] != len(self.features):
            raise AssertionError("X not same size as base.")
        if X_context.shape[1] != len(self.contexts):
            raise AssertionError("X_context not same size as context.")

    def _track(self, iterable, total):
        """Wrap `iterable` with the progress callback when one was given."""
        if self.progress is None:
            return iterable
        return self.progress(iterable, total=total)

    def fit(self, X, X_context):
        """
        Learn which (context, feature) pairs become interaction columns.

        A feature is expanded only if it co-occurs with at least two
        contexts; every context it co-occurs with then gets one column.

        Args:
            X: input matrix, base feature columns
            X_context: input matrix, context feature columns

        Returns:
            self

        Raises:
            AssertionError: if the column counts do not match the configured
                names, or if duplicate names make the vocabulary smaller
                than the list of column pairs.
        """
        self._check_columns(X, X_context)
        # Rows of X are selected per context, so keep X row-oriented (CSR);
        # columns of X_context are selected, so keep it column-oriented (CSC).
        if not isinstance(X, csr_matrix):
            X = csr_matrix(X)
        if not isinstance(X_context, csc_matrix):
            X_context = csc_matrix(X_context)
        n_contexts = X_context.shape[1]

        # Find possible interactions from the sparse input matrix.
        blocks = []
        # Context index that each entry of `blocks` belongs to. Contexts
        # without any record are skipped, so a block's position in the list
        # is NOT its context index and must be mapped back through this list.
        block_contexts = []
        # If each record appears in only one context, each row is selected
        # only once and the call to max() for a sparse matrix only considers
        # the nonzero entries of the selected rows.
        for i in self._track(range(n_contexts), n_contexts):
            # Get row indices of records that match context i
            start, stop = X_context.indptr[i], X_context.indptr[i + 1]
            row_list = X_context.indices[start:stop]
            if len(row_list) > 0:
                # Squash rows into binary mask for each feature
                # 1 if feature and context co-occur, 0 otherwise
                blocks.append(X[row_list, :].max(axis=0))
                block_contexts.append(i)

        col_pairs = []
        vocab = {}
        if blocks:
            # `S` has one row per non-empty context and one column per
            # feature; nonzero entries are possible interactions.
            S = csc_matrix(sp.sparse.vstack(blocks))
            S.eliminate_zeros()
            S.sort_indices()
            # Number of contexts each feature co-occurs with.
            contexts_per_feature = np.diff(S.indptr)
            # Get column indices of features that occur in at least 2 contexts
            feature_idxs = np.flatnonzero(contexts_per_feature >= 2)
            # This loop visits each interaction term of the resulting
            # vocabulary once. In the worst case, when every feature appears
            # in every context, that is (number of features) x (number of
            # contexts) terms.
            for j in self._track(feature_idxs, len(feature_idxs)):
                j = int(j)
                feature_name = self.features[j]
                for block_row in S.indices[S.indptr[j]:S.indptr[j + 1]]:
                    i = block_contexts[block_row]
                    vocab[self.contexts[i] + "_x_" + feature_name] = len(col_pairs)
                    col_pairs.append((i, j))
        self.col_pairs = col_pairs
        self.vocabulary = vocab
        # Check that vocabulary is correct size, sizes will not match
        # if features or contexts contain duplicate feature names.
        # This may occur when joining multiple vocabularies to form
        # the base feature names.
        if len(col_pairs) != len(vocab):
            raise AssertionError(
                "Length of `vocab` does not match `col_pairs`. "
                "Check for duplicate feature names."
            )
        return self

    def transform(self, X, X_context):
        """
        Build the interaction matrix for the fitted column pairs.

        Args:
            X: input matrix, base feature columns
            X_context: input matrix, context feature columns

        Returns:
            A `csc_matrix` of dtype int8 with one row per row of `X` and one
            column per fitted pair; an entry is set where the record has
            both the context and the feature of that pair.

        Raises:
            AssertionError: if the column counts do not match the configured
                names.
        """
        self._check_columns(X, X_context)
        if not isinstance(X, csr_matrix):
            X = csr_matrix(X)
        if not isinstance(X_context, csr_matrix):
            X_context = csr_matrix(X_context)
        n = X.shape[0]
        m = len(self.col_pairs)

        # Output column index for every fitted (context, feature) pair.
        col_pair_map = {(i, j): k for k, (i, j) in enumerate(self.col_pairs)}

        rows = []
        cols = []
        # If each record appears in only one context, this loop touches each
        # stored entry of X once. Row contents are read straight from the
        # CSR arrays instead of building a sparse slice per record.
        for r in self._track(range(n), n):
            contexts = X_context.indices[X_context.indptr[r]:X_context.indptr[r + 1]]
            features = X.indices[X.indptr[r]:X.indptr[r + 1]]
            for i in contexts:
                for j in features:
                    k = col_pair_map.get((i, j))
                    if k is not None:
                        rows.append(r)
                        cols.append(k)

        values = np.ones(len(rows), dtype=np.int8)
        return csc_matrix((values, (rows, cols)), shape=(n, m), dtype=np.int8)

    def fit_transform(self, X, X_context):
        """
        Fit on the inputs, then transform the same inputs.

        Args:
            X: input matrix, base feature columns
            X_context: input matrix, context feature columns

        Returns:
            The result of `transform(X, X_context)` after fitting.
        """
        return self.fit(X, X_context).transform(X, X_context)

    def get_feature_names(self):
        """
        Returns a list of feature names corresponding to column indices.
        """
        vocab = sorted(self.vocabulary.items(), key=lambda p: p[1])
        return [name for name, _ in vocab]