From 43f53031f90cf9d391cf7723b8a19489d55885aa Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Tue, 11 Apr 2023 14:43:25 -0400 Subject: [PATCH 01/31] R - delete lightgrep scanners not ported, edit Makefile.am accordingly --- src/Makefile.am | 8 +- src/scan_accts_lg.cpp | 749 ----------------------------------------- src/scan_base16_lg.cpp | 220 ------------ src/scan_email_lg.cpp | 511 ---------------------------- src/scan_gps_lg.cpp | 212 ------------ 5 files changed, 2 insertions(+), 1698 deletions(-) delete mode 100644 src/scan_accts_lg.cpp delete mode 100644 src/scan_base16_lg.cpp delete mode 100644 src/scan_email_lg.cpp delete mode 100644 src/scan_gps_lg.cpp diff --git a/src/Makefile.am b/src/Makefile.am index ebefef8a..c7d932b3 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -48,15 +48,11 @@ scan_accts.o: sbuf_flex_scanner.h scan_email.o: sbuf_flex_scanner.h scan_gps.o: sbuf_flex_scanner.h -# These scanners are based on Lightbox Technology's lightgrep +# These scanners are based on Stroz Friedberg's lightgrep lightgrep_scanners = \ pattern_scanner.cpp pattern_scanner.h \ pattern_scanner_utils.cpp pattern_scanner_utils.h \ - scan_lightgrep.cpp \ - scan_accts_lg.cpp \ - scan_base16_lg.cpp \ - scan_email_lg.cpp \ - scan_gps_lg.cpp + scan_lightgrep.cpp # scanners_builtin are the scanners that are compiled into the binary diff --git a/src/scan_accts_lg.cpp b/src/scan_accts_lg.cpp deleted file mode 100644 index be241eaf..00000000 --- a/src/scan_accts_lg.cpp +++ /dev/null @@ -1,749 +0,0 @@ -#include "config.h" - -// if liblightgrep isn't present, compiles to nothing -#ifdef HAVE_LIBLIGHTGREP - -#include -#include - -#include "be20_api/scanner_params.h" -#include "histogram.h" -#include "scan_ccns2.h" -#include "pattern_scanner.h" -#include "pattern_scanner_utils.h" - -namespace accts { - const char* const DefaultEncodingsCStrings[] = {"UTF-8", "UTF-16LE"}; - - const vector DefaultEncodings( - DefaultEncodingsCStrings, - DefaultEncodingsCStrings + - sizeof(DefaultEncodingsCStrings)/sizeof(DefaultEncodingsCStrings[0]) - ); - - const vector OnlyUTF8Encoding(1, "UTF-8"); - - const vector OnlyUTF16LEEncoding(1, "UTF-16LE"); - - const LG_KeyOptions DefaultOptions = { 0, 1 }; // patterns, case-insensitive - - // - // helper functions - // - - bool is_pdf_box(const sbuf_t& sbuf, size_t pos) { - const char box[] = "Box"; - const size_t c0 = pos >= 10 ? pos - 10 : 10 - pos - 1; - const uint8_t* i = search(sbuf.buf + c0, sbuf.buf + pos, box, box + strlen(box)); - return i != sbuf.buf + pos; -/* - return i != sbuf.buf + pos && ( - (i + 2 < sbuf.buf + pos && *(i+1) == ' ' && *(i+2) == '[') - || *(i+1) == '[' - ); -*/ - } - - inline bool valid_char(char ch) { - return isdigit(ch) || isspace(ch) || ch=='[' || ch==']' || - ch=='<' || ch=='Z' || ch=='.' || ch=='l' || ch=='j'; - } - - bool valid_phone_utf16le(const sbuf_t& sbuf, size_t pos, size_t len) { - // We want invalid characters before and after (assuming there is a - // before and after) - bool invalid_before = false; - bool invalid_after = false; - - if (pos > 16) { - for (size_t i = pos-16; i < pos; ++i) { - if (sbuf[i] != '\0' && !valid_char(sbuf[i])) { - invalid_before = true; - break; - } - } - } - else { - invalid_before = true; - } - - if (sbuf.bufsize < pos+len+16) { - for (size_t i = pos+len; i < pos+len+16; ++i) { - if (sbuf[i] != '\0' && !valid_char(sbuf[i])) { - invalid_after = true; - break; - } - } - } - else { - invalid_after = true; - } - - /* - * 2013-05-28: if followed by ' #{1,5} ' then it's not a phone either! - */ - if (pos+len+10 < sbuf.bufsize) { - if (sbuf[pos+len] == ' ' && sbuf[pos+len+1] == '\0' && - isdigit(sbuf[pos+len+2]) && sbuf[pos+len+3] == '\0') { - for (size_t i = pos+len+2; i+3 < sbuf.bufsize && i < pos+len+16; i += 2) { - if (isdigit(sbuf[i]) && sbuf[i+1] == '\0' && - sbuf[i+2] == ' ' && sbuf[i+3] == '\0') { - return false; // not valid - } - } - } - } - - /* If it is followed by a dash and a number, it's not a phone number */ - if (pos+len+4 < sbuf.bufsize) { - if (sbuf[pos+len] == '-' && sbuf[pos+len+1] == '\0' && - isdigit(sbuf[pos+len+2] && sbuf[pos+len+3] == '\0')) { - return false; - } - } - - return invalid_before && invalid_after; - } - - // - // subpatterns - // - -// const string END("([^0-9e.]|(\\.[^0-9]))"); - const string END("([^\\z2E\\z30-\\z39\\z45\\z65]|(\\.[^\\z30-\\z39]))"); - const string BLOCK("[0-9]{4}"); - const string DELIM("[- ]"); - const string DB("(" + BLOCK + DELIM + ")"); - const string SDB("([45][0-9]{3}" + DELIM + ")"); - const string TDEL("[ /.-]"); - - const string PHONETEXT_UTF8_CTX("[^\\z41-\\z5A\\z61-\\z7A]"); - const string PHONETEXT_UTF16LE_CTX("([^\\z41-\\z5A\\z61-\\z7A]\\z00|[^\\z00])"); - const string PHONETEXT_COMMON("(tel[.ephon]*|fax|facsimile|DSN|telex|TTD|mobile|cell):?"); - const string PHONETEXT_UTF8("(" + PHONETEXT_UTF8_CTX + PHONETEXT_COMMON + ")"); - const string PHONETEXT_UTF16LE("(" + PHONETEXT_UTF16LE_CTX + PHONETEXT_COMMON + ")"); - - const string YEAR("(19[0-9][0-9]|20[01][0-9])"); - const string MONTH("(Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(tember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?|0?[1-9]|1[0-2])"); - const string DAY("([0-2]?[0-9]|3[01])"); - - const string SYEAR("([0-9][0-9])"); - const string SMONTH("([01][0-2])"); - - const string DATEA("(" + YEAR + "-" + MONTH + "-" + DAY + ")"); - const string DATEB("(" + YEAR + "/" + MONTH + "/" + DAY + ")"); - const string DATEC("(" + DAY + " " + MONTH + " " + YEAR + ")"); - const string DATED("(" + MONTH + " " + DAY + "[, ]+" + YEAR + ")"); - - const string DATEFORMAT("(" + DATEA + "|" + DATEB + "|" + DATEC + "|" + DATED + ")"); - - // - // the scaner - // - - class Scanner: public PatternScanner { - public: - Scanner(): PatternScanner("accts_lg"), CCN_Recorder(0), CCN_Track2_Recorder(0), Telephone_Recorder(0), Alert_Recorder(0), PII_Recorder(0), SIN_Recorder(0) {} - virtual ~Scanner() {} - - virtual Scanner* clone() const { return new Scanner(*this); } - - virtual void startup(const scanner_params& sp); - virtual void init(const scanner_params& sp); - virtual void initScan(const scanner_params&); - - feature_recorder* CCN_Recorder; - feature_recorder* CCN_Track2_Recorder; - feature_recorder* Telephone_Recorder; - feature_recorder* Alert_Recorder; - feature_recorder* PII_Recorder; - feature_recorder* SIN_Recorder; - - void ccnHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void ccnUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void ccnTrack2HitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void ccnTrack2UTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void telephoneHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void telephoneUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void telephoneTrailingCtxHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void telephoneTrailingCtxUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void validatedTelephoneHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void validatedTelephoneUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void bitlockerHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void bitlockerUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void piiHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void piiUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void sinHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void sinUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void sinHitHandler2(const LG_SearchHit& hit, const scanner_params& sp); - - void sinUTF16LEHitHandler2(const LG_SearchHit& hit, const scanner_params& sp); - - void dateHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - private: - Scanner(const Scanner& s): - PatternScanner(s), - CCN_Recorder(s.CCN_Recorder), - CCN_Track2_Recorder(s.CCN_Track2_Recorder), - Telephone_Recorder(s.Telephone_Recorder), - Alert_Recorder(s.Alert_Recorder), - PII_Recorder(s.PII_Recorder), - SIN_Recorder(s.SIN_Recorder) - {} - - Scanner& operator=(const Scanner&); - }; - - void Scanner::startup(const scanner_params& sp) { - sp.check_version(); - - sp.info->name = "accts_lg"; - sp.info->author = "Simson L. Garfinkel, modified by Tim Walsh"; - sp.info->description = "scans for CCNs, track 2, PII (including SSN and Canadian SIN), and phone #s"; - sp.info->scanner_version = "1.0"; - - // define the feature files this scanner creates - sp.info->feature_names.insert("ccn"); - sp.info->feature_names.insert("pii"); // personally identifiable information - sp.info->feature_names.insert("sin"); // canadian social insurance number - sp.info->feature_names.insert("ccn_track2"); - sp.info->feature_names.insert("telephone"); - sp.info->histogram_defs.insert(histogram_def("ccn", "", "histogram")); - sp.info->histogram_defs.insert(histogram_def("ccn_track2", "", "histogram")); - - // define the histograms to make - sp.info->histogram_defs.insert( - histogram_def("telephone", "", "histogram", HistogramMaker::FLAG_NUMERIC) - ); - - scan_ccns2_debug = sp.info->config->debug; // get debug value - } - - void Scanner::init(const scanner_params& sp) { - // - // patterns - // - - // FIXME: leading context - // FIXME: trailing context - /* #### #### #### #### --- most credit card numbers*/ - const string REGEX2("[^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A]" + SDB + DB + DB + BLOCK + END); - - new Handler( - *this, - REGEX2, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::ccnHitHandler - ); - - const string REGEX2_UTF16LE("([^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A]\\z00|[^\\z00])" + SDB + DB + DB + BLOCK + END); - - new Handler( - *this, - REGEX2_UTF16LE, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::ccnUTF16LEHitHandler - ); - - // FIXME: leading context - // FIXME: trailing context - /* 3### ###### ######### --- 15 digits beginning with 3 and funny space. */ - /* Must be american express... */ - const string REGEX3("[^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A\\z2E]3[0-9]{3}" + DELIM + "[0-9]{6}" + DELIM + "[0-9]{5}" + END); - - new Handler( - *this, - REGEX3, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::ccnHitHandler - ); - - const string REGEX3_UTF16LE("([^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A\\z2E]\\z00|[^\\z00])3[0-9]{3}" + DELIM + "[0-9]{6}" + DELIM + "[0-9]{5}" + END); - - new Handler( - *this, - REGEX3_UTF16LE, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::ccnUTF16LEHitHandler - ); - - // FIXME: leading context - // FIXME: trailing context - /* 3### ###### ######### --- 15 digits beginning with 3 and funny space. */ - /* Must be american express... */ - const string REGEX4("[^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A\\z2E]3[0-9]{14}" + END); - - new Handler( - *this, - REGEX4, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::ccnHitHandler - ); - - const string REGEX4_UTF16LE("([^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A\\z2E]\\z00|[^\\z00])3[0-9]{14}" + END); - - new Handler( - *this, - REGEX4_UTF16LE, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::ccnUTF16LEHitHandler - ); - - // FIXME: leading context - // FIXME: trailing context - /* ############### 13-19 numbers as a block beginning with a 4 or 5 - * followed by something that is not a digit. - * Yes, CCNs can now be up to 19 digits long. - * http://www.creditcards.com/credit-card-news/credit-card-appearance-1268.php - */ - const string REGEX5("[^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A\\z2E][4-6][0-9]{15,18}" + END); - - new Handler( - *this, - REGEX5, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::ccnHitHandler - ); - - const string REGEX5_UTF16LE("([^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A\\z2E]\\z00|[^\\z00])[4-6][0-9]{15,18}" + END); - - new Handler( - *this, - REGEX5_UTF16LE, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::ccnUTF16LEHitHandler - ); - - // FIXME: leading context - /* ;###############=YYMM101#+? --- track2 credit card data */ - /* {SYEAR}{SMONTH} */ - /* ;CCN=05061010000000000738? */ - const string REGEX6("[^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A][4-6][0-9]{15,18}=" + SYEAR + SMONTH + "101[0-9]{13}"); - - new Handler( - *this, - REGEX6, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::ccnTrack2HitHandler - ); - - const string REGEX6_UTF16LE("([^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A]\\z00|[^\\z00])[4-6][0-9]{15,18}=" + SYEAR + SMONTH + "101[0-9]{13}"); - - new Handler( - *this, - REGEX6_UTF16LE, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::ccnTrack2UTF16LEHitHandler - ); - - // FIXME: trailing context - // FIXME: leading context - /* US phone numbers without area code in parens */ - /* New addition: If proceeded by " ####? ####? " - * then do not consider this a phone number. We see a lot of that stuff in - * PDF files. - */ - const string REGEX7("[^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A]([0-9]{3}" + TDEL + "){2}[0-9]{4}" + END); - - new Handler( - *this, - REGEX7, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::validatedTelephoneHitHandler - ); - - const string REGEX7_UTF16LE("([^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A]\\z00|[^\\z00])([0-9]{3}" + TDEL + "){2}[0-9]{4}" + END); - - new Handler( - *this, - REGEX7, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::validatedTelephoneUTF16LEHitHandler - ); - - // FIXME: trailing context - // FIXME: leading context - /* US phone number with parens, like (215) 555-1212 */ - const string REGEX8("[^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A]\\([0-9]{3}\\)" + TDEL + "?[0-9]{3}" + TDEL + "[0-9]{4}" + END); - - new Handler( - *this, - REGEX8, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::telephoneTrailingCtxHitHandler - ); - - const string REGEX8_UTF16LE("([^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A]\\z00|[^\\z00])\\([0-9]{3}\\)" + TDEL + "?[0-9]{3}" + TDEL + "[0-9]{4}" + END); - - new Handler( - *this, - REGEX8_UTF16LE, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::telephoneTrailingCtxUTF16LEHitHandler - ); - - // FIXME: trailing context - // FIXME: leading context - /* Generalized international phone numbers */ - const string REGEX9("[^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A]\\+[0-9]{1,3}(" + TDEL + "[0-9]{2,3}){2,6}[0-9]{2,4}" + END); - - new Handler( - *this, - REGEX9, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::validatedTelephoneHitHandler - ); - - const string REGEX9_UTF16LE("([^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A]\\z00|[^\\z00])\\+[0-9]{1,3}(" + TDEL + "[0-9]{2,3}){2,6}[0-9]{2,4}" + END); - - new Handler( - *this, - REGEX9, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::validatedTelephoneHitHandler - ); - - // FIXME: leading context - /* Generalized number with prefix */ - const string REGEX10(PHONETEXT_UTF8 + "[0-9/ .+]{7,18}"); - - new Handler( - *this, - REGEX10, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::telephoneHitHandler - ); - - const string REGEX10_UTF16LE(PHONETEXT_UTF16LE + "[0-9/ .+]{7,18}"); - - new Handler( - *this, - REGEX10_UTF16LE, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::telephoneUTF16LEHitHandler - ); - - // FIXME: leading context - /* Generalized number with city code and prefix */ - const string REGEX11(PHONETEXT_UTF8 + "[0-9 +]+ ?\\([0-9]{2,4}\\) ?[\\-0-9]{4,8}"); - - new Handler( - *this, - REGEX11, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::telephoneHitHandler - ); - - const string REGEX11_UTF16LE(PHONETEXT_UTF16LE + "[0-9 +]+ ?\\([0-9]{2,4}\\) ?[\\-0-9]{4,8}"); - - new Handler( - *this, - REGEX11_UTF16LE, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::telephoneUTF16LEHitHandler - ); - - // FIXME: trailing context - /* Generalized international phone numbers */ - const string REGEX12("fedex[^a-z]+([0-9]{4}[- ]?){2}[0-9]" + END); - - new Handler( - *this, - REGEX12, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::piiHitHandler - ); - - new Handler( - *this, - REGEX12, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::piiUTF16LEHitHandler - ); - - // FIXME: trailing context - const string REGEX13("ssn:?[ \\t]+[0-9]{3}-?[0-9]{2}-?[0-9]{4}" + END); - - new Handler( - *this, - REGEX13, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::piiHitHandler - ); - - new Handler( - *this, - REGEX13, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::piiUTF16LEHitHandler - ); - - const string REGEX14("dob:?[ \\t]+" + DATEFORMAT); - - new Handler( - *this, - REGEX14, - DefaultEncodings, - DefaultOptions, - &Scanner::dateHitHandler - ); - - // FIXME: trailing context - const string REGEX15("sin:?[ \\t]+[0-9]{3}[ -]?[0-9]{3}[ -]?[0-9]{3}" + END); - - new Handler( - *this, - REGEX15, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::sinHitHandler - ); - - new Handler( - *this, - REGEX15, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::sinUTF16LEHitHandler - ); - - const string REGEX16("[^0-9][0-9]{3}-[0-9]{3}-[0-9]{3}" + END); - - new Handler( - *this, - REGEX16, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::sinHitHandler2 - ); - - new Handler( - *this, - REGEX16, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::sinUTF16LEHitHandler2 - ); - - // FIXME: leading context - // FIXME: trailing context - /* Possible BitLocker Recovery Key. */ - const string BITLOCKER("[^\\z30-\\z39]([0-9]{6}-){7}[0-9]{6}[^\\z30-\\z39]"); - - new Handler( - *this, - BITLOCKER, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::bitlockerHitHandler - ); - - const string BITLOCKER_UTF16LE("([^\\z30-\\z39]\\z00|[^\\z00])([0-9]{6}-){7}[0-9]{6}[^\\z30-\\z39]"); - - new Handler( - *this, - BITLOCKER, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::bitlockerUTF16LEHitHandler - ); - } - - void Scanner::initScan(const scanner_params& sp) { - CCN_Recorder = sp.fs.named_feature_recorder("ccn"); - CCN_Track2_Recorder = sp.fs.named_feature_recorder("ccn_track2"); - Telephone_Recorder = sp.fs.named_feature_recorder("telephone"); - Alert_Recorder = sp.fs.get_alert_recorder(); - PII_Recorder = sp.fs.named_feature_recorder("pii"); - SIN_Recorder = sp.fs.named_feature_recorder("sin"); - } - - void Scanner::ccnHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + 1; - const size_t len = hit.End - (*(sp.sbuf.buf+hit.End-2) == '.' ? 2 : 1) - pos; - - if (valid_ccn(reinterpret_cast(sp.sbuf.buf)+pos, len)) { - CCN_Recorder->write_buf(sp.sbuf, pos, len); - } - } - - void Scanner::ccnUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + (*(sp.sbuf.buf+hit.Start+1) == '\0' ? 2 : 1); const size_t len = hit.End - pos; - - const string ascii(low_utf16le_to_ascii(sp.sbuf.buf+pos, len)); - if (valid_ccn(ascii.c_str(), ascii.size())) { - CCN_Recorder->write_buf(sp.sbuf, pos, len); - } - } - - void Scanner::ccnTrack2HitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + 1; - const size_t len = hit.End - pos; - - if (valid_ccn(reinterpret_cast(sp.sbuf.buf)+pos, len)) { - CCN_Recorder->write_buf(sp.sbuf, pos, len); - } - } - - void Scanner::ccnTrack2UTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + (*(sp.sbuf.buf+hit.Start+1) == '\0' ? 2 : 1); - const size_t len = hit.End - pos; - - const string ascii(low_utf16le_to_ascii(sp.sbuf.buf+pos, len)); - if (valid_ccn(ascii.c_str(), ascii.size())) { - CCN_Recorder->write_buf(sp.sbuf, pos, len); - } - } - - void Scanner::telephoneHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - Telephone_Recorder->write_buf(sp.sbuf, hit.Start+1, hit.End-hit.Start-1); - } - - void Scanner::telephoneUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t start = hit.Start + (*(sp.sbuf.buf + hit.Start + 1) == '\0' ? 2 : 1); - const size_t len = hit.End - start; - - Telephone_Recorder->write_buf(sp.sbuf, start, len); - } - - void Scanner::telephoneTrailingCtxHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - Telephone_Recorder->write_buf( - sp.sbuf, - hit.Start+1, - hit.End - (*(sp.sbuf.buf+hit.End-2) == '.' ? 2 : 1) - (hit.Start+1) - ); - } - - void Scanner::telephoneTrailingCtxUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - Telephone_Recorder->write_buf( - sp.sbuf, - hit.Start+1, - hit.End - (*(sp.sbuf.buf+hit.End-3) == '.' ? 3 : 1) -(hit.Start+1) - ); - } - - void Scanner::validatedTelephoneHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + 1; - const size_t len = hit.End - (*(sp.sbuf.buf+hit.End-2) == '.' ? 2 : 1) - pos; - if (valid_phone(sp.sbuf, pos, len)){ - if (!is_pdf_box(sp.sbuf, pos)) { - Telephone_Recorder->write_buf(sp.sbuf, pos, len); - } - } - } - - void Scanner::validatedTelephoneUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + 1; - const size_t len = hit.End - (*(sp.sbuf.buf+hit.End-2) == '.' ? 2 : 1) - pos; - if (valid_phone_utf16le(sp.sbuf, pos, len)){ - Telephone_Recorder->write_buf(sp.sbuf, pos, len); - } - } - - void Scanner::bitlockerHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - Alert_Recorder->write(sp.sbuf.pos0 + hit.Start + 1, reinterpret_cast(sp.sbuf.buf) + 1, "Possible BitLocker Recovery Key (ASCII)."); - } - - void Scanner::bitlockerUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + (*(sp.sbuf.buf + hit.Start + 1) == '\0' ? 2 : 1); - const size_t len = (hit.End - 1) - pos; - - Alert_Recorder->write(sp.sbuf.pos0 + pos, low_utf16le_to_ascii(sp.sbuf.buf + pos, len), "Possible BitLocker Recovery Key (UTF-16)."); - } - - void Scanner::piiHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - PII_Recorder->write_buf( - sp.sbuf, hit.Start, - hit.End - (*(sp.sbuf.buf+hit.End-2) == '.' ? 2 : 1) - hit.Start - ); - } - - void Scanner::piiUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - PII_Recorder->write_buf( - sp.sbuf, hit.Start, - hit.End - (*(sp.sbuf.buf+hit.End-3) == '.' ? 3 : 1) - hit.Start - ); - } - - void Scanner::sinHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - SIN_Recorder->write_buf( - sp.sbuf, hit.Start, - hit.End - (*(sp.sbuf.buf+hit.End-2) == '.' ? 2 : 1) - hit.Start - ); - } - - void Scanner::sinUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - SIN_Recorder->write_buf( - sp.sbuf, hit.Start, - hit.End - (*(sp.sbuf.buf+hit.End-3) == '.' ? 3 : 1) - hit.Start - ); - } - - void Scanner::sinHitHandler2(const LG_SearchHit& hit, const scanner_params& sp) { - SIN_Recorder->write_buf( - sp.sbuf, hit.Start+1, - hit.End - (*(sp.sbuf.buf+hit.End-2) == '.' ? 2 : 1) - hit.Start - ); - } - - void Scanner::sinUTF16LEHitHandler2(const LG_SearchHit& hit, const scanner_params& sp) { - SIN_Recorder->write_buf( - sp.sbuf, hit.Start+1, - hit.End - (*(sp.sbuf.buf+hit.End-3) == '.' ? 3 : 1) - hit.Start - ); - } - - void Scanner::dateHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - PII_Recorder->write_buf(sp.sbuf, hit.Start, hit.End - hit.Start); - } - - Scanner TheScanner; -} - -extern "C" -void scan_accts_lg(struct scanner_params &sp) { - scan_lg(accts::TheScanner, sp, rcb); -} - -#endif // HAVE_LIBLIGHTGREP diff --git a/src/scan_base16_lg.cpp b/src/scan_base16_lg.cpp deleted file mode 100644 index 75ce4a9d..00000000 --- a/src/scan_base16_lg.cpp +++ /dev/null @@ -1,220 +0,0 @@ -#include "config.h" - -// if liblightgrep isn't present, compiles to nothing -#ifdef HAVE_LIBLIGHTGREP - -#include - -#include "be20_api/scanner_params.h" -#include "histogram.h" -#include "pattern_scanner.h" - -namespace base16 { -// const char* const DefaultEncodingsCStrings[] = {"UTF-8", "UTF-16LE"}; - const char* const DefaultEncodingsCStrings[] = {"UTF-8"}; - - const vector DefaultEncodings( - DefaultEncodingsCStrings, - DefaultEncodingsCStrings + - sizeof(DefaultEncodingsCStrings)/sizeof(DefaultEncodingsCStrings[0]) - ); - - const LG_KeyOptions DefaultOptions = { 0, 1 }; // patterns, case-insensitive - - // - // the scanner - // - - class Scanner: public PatternScanner { - public: - Scanner(): PatternScanner("base16_lg"), Recorder(0) {} - virtual ~Scanner() {} - - virtual Scanner* clone() const { return new Scanner(*this); } - - virtual void startup(const scanner_params& sp); - virtual void init(const scanner_params& sp); - virtual void initScan(const scanner_params&); - - feature_recorder &Recorder; - - void hitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - decode(sp.sbuf, hit.Start, hit.End - hit.Start, sp, rcb); - } - - private: - Scanner(const Scanner& s): - PatternScanner(s), - Recorder(s.Recorder) - {} - - Scanner& operator=(const Scanner&); - - void decode(const sbuf_t& osbuf, size_t pos, size_t len, const scanner_params& sp); - }; - - const uint16_t BASE16_LSN[256] = { - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 256, 256, 256, 256, 256, 256, - 256, 10, 11, 12, 13, 14, 15, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 10, 11, 12, 13, 14, 15, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256 - }; - - const uint16_t BASE16_MSN[256] = { - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 0, 16, 32, 48, 64, 80, 96, 112, - 128, 144, 256, 256, 256, 256, 256, 256, - 256, 160, 176, 192, 208, 224, 240, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 160, 176, 192, 208, 224, 240, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 556, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256 - }; - - void Scanner::startup(const scanner_params& sp) { - sp.check_version(); - - sp.info.set_name("base16_lg"); - sp.info->name = "base16_lg"; - sp.info->author = "Simson L. Garfinkel"; - sp.info->description = "Base16 (hex) scanner"; - sp.info->scanner_version = "1.0"; - sp.info->flags = scanner_info::SCANNER_RECURSE; - sp.info->feature_names.insert("hex"); // notable hex values - } - - void Scanner::init(const scanner_params& sp) { - // - // patterns - // - - /* - * a hex string - * {0,2} means we have 0-2 space characters - * {6,} means minimum of 6 hex bytes - */ - const std::string HEX("[0-9A-F]{2}(([ \\n]|\\r\\n){0,2}[0-9A-F]{2}){5,}"); - - new Handler( - *this, HEX, DefaultEncodings, DefaultOptions, &Scanner::hitHandler - ); - } - - void Scanner::initScan(const scanner_params& sp) { - Recorder = sp.fs.named_feature_recorder("hex"); - } - - // Don't re-analyze hex bufs smaller than this - const unsigned int opt_min_hex_buf = 64; - - size_t base16_decode_skipping_invalid(uint8_t* dst_start, const uint8_t* src, const uint8_t* src_end) { - uint8_t* dst = dst_start; - uint16_t byte; - uint8_t msn, lsn; - - while (src < src_end) { - msn = *src++; - lsn = *src++; - byte = BASE16_MSN[msn] | BASE16_LSN[lsn]; - if (byte < 0x100) { - *dst++ = static_cast(byte); - } - else { - // A "byte" value over FF means we've hit something invalid. The - // pattern requires that hex digits come in pairs, so the first - // character is invalid. Just advance one byte (== backing up one - // byte now, since we've already gone ahead two bytes). - --src; - } - } - - return dst - dst_start; - } - - void Scanner::decode(const sbuf_t& osbuf, size_t pos, size_t len, const scanner_params& sp) { - sbuf_t sbuf(osbuf, pos, len); // the substring we are working with - - TODO: Replace managed_malloc with a sbuf_t::sbuf_malloc - managed_malloc b(sbuf.pagesize/2); - if (b.buf == 0) return; - - const size_t p = base16_decode_skipping_invalid( - b.buf, sbuf.buf, sbuf.buf+sbuf.pagesize - ); - - // Alert on byte sequences of 48, 128 or 256 bits - if (p == 48/8 || p == 128/8 || p == 256/8) { - // it validates; write original with context - Recorder->write_buf(osbuf, pos, len); - return; // Small keys don't get recursively analyzed - } - - if (p > opt_min_hex_buf) { - // NB: we manually add BASE16 here when recursing, because - // rcb.partName is LIGHTGREP here, which is not useful. - sbuf_t nsbuf(osbuf.pos0 + pos + "BASE16", b.buf, p, p, false); - (*rcb.callback)(scanner_params(sp, nsbuf)); // recurse - } - } - - Scanner TheScanner; -} - -extern "C" -void scan_base16_lg(struct scanner_params &sp) { - scan_lg(base16::TheScanner, sp, rcb); -} - -#endif // HAVE_LIBLIGHTGREP diff --git a/src/scan_email_lg.cpp b/src/scan_email_lg.cpp deleted file mode 100644 index 84db124f..00000000 --- a/src/scan_email_lg.cpp +++ /dev/null @@ -1,511 +0,0 @@ -#include "config.h" - -// if liblightgrep isn't present, compiles to nothing -#ifdef HAVE_LIBLIGHTGREP - -#include -#include -#include - -#include "be20_api/scanner_params.h" - -#include "histogram.h" -#include "pattern_scanner.h" -#include "pattern_scanner_utils.h" -#include "utils.h" // needs config.h - -using namespace std; - -namespace email { - const char* const DefaultEncodingsCStrings[] = {"UTF-8", "UTF-16LE"}; - - const vector DefaultEncodings( - DefaultEncodingsCStrings, - DefaultEncodingsCStrings + - sizeof(DefaultEncodingsCStrings)/sizeof(DefaultEncodingsCStrings[0]) - ); - - const vector OnlyUTF8Encoding(1, "UTF-8"); - - const vector OnlyUTF16LEEncoding(1, "UTF-16LE"); - - const LG_KeyOptions DefaultOptions = { 0, 1 }; // patterns, case-insensitive - - // - // subpatterns - // - - const string INUM("(1?[0-9]{1,2}|2([0-4][0-9]|5[0-5]))"); - const string HEX("[0-9a-f]"); - const string ALNUM("[a-zA-Z0-9]"); - - const string PC("[\\x20-\\x7E]"); - - const string TLD("(AC|AD|AE|AERO|AF|AG|AI|AL|AM|AN|AO|AQ|AR|ARPA|AS|ASIA|AT|AU|AW|AX|AZ|BA|BB|BD|BE|BF|BG|BH|BI|BIZ|BJ|BL|BM|BN|BO|BR|BS|BT|BV|BW|BY|BZ|CA|CAT|CC|CD|CF|CG|CH|CI|CK|CL|CM|CN|CO|COM|COOP|CR|CU|CV|CX|CY|CZ|DE|DJ|DK|DM|DO|DZ|EC|EDU|EE|EG|EH|ER|ES|ET|EU|FI|FJ|FK|FM|FO|FR|GA|GB|GD|GE|GF|GG|GH|GI|GL|GM|GN|GOV|GP|GQ|GR|GS|GT|GU|GW|GY|HK|HM|HN|HR|HT|HU|ID|IE|IL|IM|IN|INFO|INT|IO|IQ|IR|IS|IT|JE|JM|JO|JOBS|JP|KE|KG|KH|KI|KM|KN|KP|KR|KW|KY|KZ|LA|LB|LC|LI|LK|LR|LS|LT|LU|LV|LY|MA|MC|MD|ME|MF|MG|MH|MIL|MK|ML|MM|MN|MO|MOBI|MP|MQ|MR|MS|MT|MU|MUSEUM|MV|MW|MX|MY|MZ|NA|NAME|NC|NE|NET|NF|NG|NI|NL|NO|NP|NR|NU|NZ|OM|ORG|PA|PE|PF|PG|PH|PK|PL|PM|PN|PR|PRO|PS|PT|PW|PY|QA|RE|RO|RS|RU|RW|SA|SB|SC|SD|SE|SG|SH|SI|SJ|SK|SL|SM|SN|SO|SR|ST|SU|SV|SY|SZ|TC|TD|TEL|TF|TG|TH|TJ|TK|TL|TM|TN|TO|TP|TR|TRAVEL|TT|TV|TW|TZ|UA|UG|UK|UM|US|UY|UZ|VA|VC|VE|VG|VI|VN|VU|WF|WS|YE|YT|YU|ZA|ZM|ZW)"); - - const string YEAR("(19[6-9][0-9]|20[0-1][0-9])"); - const string DAYOFWEEK("(Mon|Tue|Wed|Thu|Fri|Sat|Sun)"); - const string MONTH("(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"); - const string ABBREV("(UTC?|GMT|EST|EDT|CST|CDT|MST|MDT|PST|PDT|[ZAMNY])"); - - // - // helper functions - // - - // NB: It is very important *not* to use functions expecting C strings - // or std::strings on hit data, as hit data could contain internal null - // bytes. - - /** return the offset of the domain in an email address. - * returns buflen + 1 if the domain is not found. - * the domain extends to the end of the email address - */ - inline size_t find_domain_in_email(const uint8_t* buf, size_t buflen) { - return find(buf, buf + buflen, '@') - buf + 1; - } - - template - inline size_t find_domain_in_url(const T* buf, size_t buflen, size_t& domain_len) { - const T* dbeg = search_n(buf, buf + buflen, 2, '/') + 2; - if (dbeg < buf + buflen) { - const T stop[] = { '/', ':' }; - const T* dend = find_first_of(dbeg, buf + buflen, stop, stop + 2); - domain_len = dend - dbeg; - return dbeg - buf; - } - - return buflen; - } - - bool valid_ether_addr(const uint8_t* buf) { - if (memcmp((const uint8_t *)"00:00:00:00:00:00", buf, 17) == 0) { - return false; - } - - if (memcmp((const uint8_t *)"00:11:22:33:44:55", buf, 17) == 0) { - return false; - } - - /* Perform a quick histogram analysis. - * For each group of characters, create a value based on the two digits. - * There is no need to convert them to their 'actual' value. - * Don't accept a histogram that has 3 values. That could be - * 11:11:11:11:22:33 - * Require 4, 5 or 6. - * If we have 4 or more distinct values, then treat it good. - * Otherwise its is some pattern we don't want. - */ - set ctr; - for (uint32_t i = 0; i < 6; ++i) { // loop over each group - // create a unique value of the two characters - ctr.insert((buf[i*3] << 8) + buf[i*3+1]); - } - - return ctr.size() >= 4; - } - - template - bool valid_ipaddr(const T* leftguard, const T* hit) { - // copy up to 'window' preceding Ts into context array - static const ssize_t window = 8; - T context[window] = { ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ' }; - const ssize_t diff = min(hit - leftguard, window); - copy(hit - diff, hit, context + window - diff); - - if ( - isalnum(context[7]) || - context[7] == '.' || - context[7] == '-' || - context[7] == '+' || - (ishexnumber(context[4]) && ishexnumber(context[5]) && - ishexnumber(context[6]) && context[7] == '}') || - (*hit == '0' && *(hit + 1) == '.')) - { - // ignore - return false; - } - - static const struct { - size_t pos; - const char* str; - } checks[] = { - { 5, "v." }, - { 5, "v " }, - { 5, "rv:" }, // rv:1.9.2.8 as in Mozilla - { 4, ">=" }, // >= 1.8.0.10 - { 4, "<=" }, // <= 1.8.0.10 - { 4, "<<" }, // << 1.8.0.10 - { 4, "ver" }, - { 4, "Ver" }, - { 4, "VER" }, - { 0, "rsion" }, - { 0, "ion=" }, - { 0, "PSW/" }, // PWS/1.5.19.3 ... - { 0, "flash=" }, // flash= - { 0, "stone=" }, // Milestone= - { 4, "NSS" }, - { 0, "/2001," }, // /2001,3.60.50.8 - { 0, "TI_SZ" } // %REG_MULTI_SZ%, - }; - - for (size_t i = 0; i < sizeof(checks)/sizeof(checks[0]); ++i) { - if (search( - context + checks[i].pos, - context + 8, checks[i].str, - checks[i].str + strlen(checks[i].str) - ) != context + 8) { - return false; - } - } - - return true; - } - - // - // the scanner - // - - class Scanner: public PatternScanner { - public: - Scanner(): PatternScanner("email_lg"), RFC822_Recorder(0), Email_Recorder(0), Domain_Recorder(0), Ether_Recorder(0), URL_Recorder(0) {} - virtual ~Scanner() {} - - virtual Scanner* clone() const { return new Scanner(*this); } - - virtual void startup(const scanner_params& sp); - virtual void init(const scanner_params& sp); - virtual void initScan(const scanner_params& sp); - - feature_recorder* RFC822_Recorder; - feature_recorder* Email_Recorder; - feature_recorder* Domain_Recorder; - feature_recorder* Ether_Recorder; - feature_recorder* URL_Recorder; - - void rfc822HitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void emailHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void emailUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void ipaddrHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void ipaddrUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void etherHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void etherUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void protoHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void protoUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - private: - Scanner(const Scanner& s): - PatternScanner(s), - RFC822_Recorder(s.RFC822_Recorder), - Email_Recorder(s.Email_Recorder), - Domain_Recorder(s.Domain_Recorder), - Ether_Recorder(s.Ether_Recorder), - URL_Recorder(s.URL_Recorder) - {} - - Scanner& operator=(const Scanner&); - }; - - void Scanner::startup(const scanner_params& sp) { - sp.check_version(); - - sp.info->name = "email_lg"; - sp.info->author = "Simson L. Garfinkel"; - sp.info->description = "Scans for email addresses, domains, URLs, RFC822 headers, etc."; - sp.info->scanner_version = "1.0"; - - // define the feature files this scanner creates - sp.info->feature_names.insert("email"); - sp.info->feature_names.insert("domain"); - sp.info->feature_names.insert("url"); - sp.info->feature_names.insert("rfc822"); - sp.info->feature_names.insert("ether"); - - // define the histograms to make - sp.info->histogram_defs.insert(histogram_def("email", "", "histogram", HistogramMaker::FLAG_LOWERCASE)); - sp.info->histogram_defs.insert(histogram_def("domain", "", "histogram")); - sp.info->histogram_defs.insert(histogram_def("url", "", "histogram")); - sp.info->histogram_defs.insert(histogram_def("url", "://([^/]+)", "services")); - sp.info->histogram_defs.insert(histogram_def("url", "://((cid-[0-9a-f])+[a-z.].live.com/)", "microsoft-live")); - sp.info->histogram_defs.insert(histogram_def("url", "://[-_a-z0-9.]+facebook.com/.*[&?]{1}id=([0-9]+)", "facebook-id")); - sp.info->histogram_defs.insert(histogram_def("url", "://[-_a-z0-9.]+facebook.com/([a-zA-Z0-9.]*[^/?&]$)", "facebook-address", HistogramMaker::FLAG_LOWERCASE)); - sp.info->histogram_defs.insert(histogram_def("url", "search.*[?&/;fF][pq]=([^&/]+)", "searches")); - } - - void Scanner::init(const scanner_params& sp) { - // - // patterns - // - - const string DATE(DAYOFWEEK + ",[ \\t\\n\\r]+[0-9]{1,2}[ \\t\\n\\r]+" + MONTH + "[ \\t\\n\\r]+" + YEAR + "[ \\t\\n\\r]+[0-2][0-9]:[0-5][0-9]:[0-5][0-9][ \\t\\n\\r]+([+-][0-2][0-9][0314][05]|" + ABBREV + ")"); - - new Handler( - *this, - DATE, - DefaultEncodings, - DefaultOptions, - &Scanner::rfc822HitHandler - ); - - const string MESSAGE_ID("Message-ID:([ \\t\\n]|\\r\\n)?<" + PC + "+>"); - - new Handler( - *this, - MESSAGE_ID, - DefaultEncodings, - DefaultOptions, - &Scanner::rfc822HitHandler - ); - - const string SUBJECT("Subject:[ \\t]?" + PC + "+"); - - new Handler( - *this, - SUBJECT, - DefaultEncodings, - DefaultOptions, - &Scanner::rfc822HitHandler - ); - - const string COOKIE("Cookie:[ \\t]?" + PC + "+"); - - new Handler( - *this, - COOKIE, - DefaultEncodings, - DefaultOptions, - &Scanner::rfc822HitHandler - ); - - const string HOST("Host:[ \\t]?[a-zA-Z0-9._]+"); - - new Handler( - *this, - HOST, - DefaultEncodings, - DefaultOptions, - &Scanner::rfc822HitHandler - ); - - // FIXME: trailing context -// const string EMAIL(ALNUM + "[a-zA-Z0-9._%\\-+]+" + ALNUM + "@" + ALNUM + "[a-zA-Z0-9._%\\-]+\\." + TLD + "[^\\z41-\\z5A\\z61-\\z7A]"); - const string EMAIL(ALNUM + "(\\.?[a-zA-Z0-9_%\\-+])+\\.?" + ALNUM + "@" + ALNUM + "(\\.?[a-zA-Z0-9_%\\-])+\\." + TLD + "[^\\z41-\\z5A\\z61-\\z7A]"); - - new Handler( - *this, - EMAIL, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::emailHitHandler - ); - - new Handler( - *this, - EMAIL, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::emailUTF16LEHitHandler - ); - - // FIXME: leading context - // FIXME: trailing context - // Numeric IP addresses. Get the context before and throw away some things - const string IP("[^\\z30-\\z39\\z2E]" + INUM + "(\\." + INUM + "){3}[^\\z30-\\z39\\z2B\\z2D\\z2E\\z41-\\z5A\\z5F\\z61-\\z7A]"); - - new Handler( - *this, - IP, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::ipaddrHitHandler - ); - - const string IP_UTF16LE("([^\\z30-\\z39\\z2E]\\z00|[^\\z00])" + INUM + "(\\." + INUM + "){3}[^\\z30-\\z39\\z2B\\z2D\\z2E\\z41-\\z5A\\z5F\\z61-\\z7A]"); - - new Handler( - *this, - IP_UTF16LE, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::ipaddrUTF16LEHitHandler - ); - - // FIXME: leading context - // FIXME: trailing context - // found a possible MAC address! - const string MAC("[^\\z30-\\z39\\z3A\\z41-\\z5A\\z61-\\z7A]" + HEX + "{2}(:" + HEX + "{2}){5}[^\\z30-\\z39\\z3A\\z41-\\z5A\\z61-\\z7A]"); - - new Handler( - *this, - MAC, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::etherHitHandler - ); - - const string MAC_UTF16LE("([^\\z30-\\z39\\z3A\\z41-\\z5A\\z61-\\z7A]\\z00|[^\\z00])" + HEX + "{2}(:" + HEX + "{2}){5}[^\\z30-\\z39\\z3A\\z41-\\z5A\\z61-\\z7A]"); - - new Handler( - *this, - MAC, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::etherUTF16LEHitHandler - ); - - const string PROTO("(https?|afp|smb)://[a-zA-Z0-9_%/\\-+@:=&?#~.;]+"); - - new Handler( - *this, - PROTO, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::protoHitHandler - ); - - new Handler( - *this, - PROTO, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::protoUTF16LEHitHandler - ); - } - - void Scanner::initScan(const scanner_params& sp) { - RFC822_Recorder = sp.named_feature_recorder("rfc822"); - Email_Recorder = sp.named_feature_recorder("email"); - Domain_Recorder = sp.named_feature_recorder("domain"); - Ether_Recorder = sp.named_feature_recorder("ether"); - URL_Recorder = sp.named_feature_recorder("url"); - } - - void Scanner::rfc822HitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - RFC822_Recorder->write_buf(sp.sbuf, hit.Start, hit.End - hit.Start); - } - - void Scanner::emailHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t len = (hit.End - 1) - hit.Start; - const uint8_t* matchStart = sp.sbuf.buf + hit.Start; - - Email_Recorder->write_buf(sp.sbuf, hit.Start, len); - const size_t domain_off = find_domain_in_email(matchStart, len); - if (domain_off < len) { - Domain_Recorder->write_buf(sp.sbuf, hit.Start + domain_off, len - domain_off); - } - } - - void Scanner::emailUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t len = (hit.End - 1) - hit.Start; - const uint8_t* matchStart = sp.sbuf.buf + hit.Start; - - Email_Recorder->write_buf(sp.sbuf, hit.Start, len); - const size_t domain_off = find_domain_in_email(matchStart, len) + 1; - if (domain_off < len) { - Domain_Recorder->write_buf(sp.sbuf, hit.Start + domain_off, len - domain_off); - } - } - - void Scanner::ipaddrHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - if (valid_ipaddr(sp.sbuf.buf, sp.sbuf.buf + hit.Start + 1)) { - Domain_Recorder->write_buf(sp.sbuf, hit.Start+1, hit.End - hit.Start - 2); - } - } - - void Scanner::ipaddrUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + (*(sp.sbuf.buf+hit.Start+1) == '\0' ? 2 : 1); - const size_t len = (hit.End - 1) - pos; - // this assumes sp.sbuf.pos will never be an odd memory address... - // if pos is odd, add 1 to sbuf.buf and use it as a leftmost guard - const uint16_t* leftguard(reinterpret_cast(sp.sbuf.buf + ((pos & 0x01) == 1 ? 1: 0))); - if (valid_ipaddr(leftguard, reinterpret_cast(sp.sbuf.buf + pos))) { - Domain_Recorder->write_buf(sp.sbuf, pos, len); - } - } - - void Scanner::etherHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + 1; - const size_t len = (hit.End - 1) - pos; - if (valid_ether_addr(sp.sbuf.buf+pos)){ - Ether_Recorder->write_buf(sp.sbuf, pos, len); - } - } - - void Scanner::etherUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + (*(sp.sbuf.buf+hit.Start+1) == '\0' ? 2 : 1); - const size_t len = (hit.End -1) - pos; - - const string ascii(low_utf16le_to_ascii(sp.sbuf.buf+pos, len)); - if (valid_ether_addr(reinterpret_cast(ascii.c_str()))){ - Ether_Recorder->write_buf(sp.sbuf, pos, len); - } - } - - void Scanner::protoHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - // for reasons that aren't clear, there are a lot of net protocols that - // have an http://domain in them followed by numbers. So this counts the - // number of slashes and if it is only 2 the size is pruned until the - // last character is a letter - const int slash_count = count( - sp.sbuf.buf + hit.Start, - sp.sbuf.buf + hit.End, '/' - ); - - size_t len = hit.End - hit.Start; - - if (slash_count == 2) { - while (len > 0 && !isalpha(sp.sbuf[hit.Start+len-1])) { - --len; - } - } - - URL_Recorder->write_buf(sp.sbuf, hit.Start, len); - - size_t domain_len = 0; - size_t domain_off = find_domain_in_url(sp.sbuf.buf + hit.Start, len, domain_len); // find the start of domain? - if (domain_off < len && domain_len > 0) { - Domain_Recorder->write_buf(sp.sbuf, hit.Start + domain_off, domain_len); - } - } - - void Scanner::protoUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const int slash_count = count( - sp.sbuf.buf + hit.Start, - sp.sbuf.buf + hit.End, '/' - ); - - size_t len = hit.End - hit.Start; - - if (slash_count == 2) { - while (len > 1 && !isalpha(sp.sbuf[hit.Start+len-2])) { - len -= 2; - } - } - - URL_Recorder->write_buf(sp.sbuf, hit.Start, len); - - size_t domain_len = 0; - size_t domain_off = find_domain_in_url(reinterpret_cast(sp.sbuf.buf + hit.Start), len/2, domain_len); // find the start of domain? - domain_off *= 2; - domain_len *= 2; - if (domain_off < len && domain_len > 0) { - Domain_Recorder->write_buf(sp.sbuf, hit.Start + domain_off, domain_len); - } - } - - Scanner TheScanner; -} - -extern "C" -void scan_email_lg(struct scanner_params &sp) { - scan_lg(email::TheScanner, sp, rcb); -} - -#endif // HAVE_LIBLIGHTGREP diff --git a/src/scan_gps_lg.cpp b/src/scan_gps_lg.cpp deleted file mode 100644 index 3bf96af2..00000000 --- a/src/scan_gps_lg.cpp +++ /dev/null @@ -1,212 +0,0 @@ -#include "config.h" - -// if liblightgrep isn't present, compiles to nothing -#ifdef HAVE_LIBLIGHTGREP - -#include - -#include "be20_api/scanner_params.h" - -#include "pattern_scanner.h" - -namespace gps { - const char* const DefaultEncodingsCStrings[] = {"UTF-8", "UTF-16LE"}; - - const vector DefaultEncodings( - DefaultEncodingsCStrings, - DefaultEncodingsCStrings + - sizeof(DefaultEncodingsCStrings)/sizeof(DefaultEncodingsCStrings[0]) - ); - - const LG_KeyOptions DefaultOptions = { 0, 0 }; // patterns, case-sensitive - - // - // helper functions - // - - /** - * Return NNN in - */ - string get_quoted_attrib(string text, string attrib) { - const size_t pos = text.find(attrib); - if (pos == string::npos) return ""; /* no attrib */ - const ssize_t quote1 = text.find('"', pos); - if (quote1 < 0) return ""; /* no opening quote */ - const ssize_t quote2 = text.find('"', quote1+1); - if (quote2 < 0) return ""; /* no closing quote */ - return text.substr(quote1+1, quote2-(quote1+1)); - } - - /** - * Return NNN in NNN - */ - string get_cdata(string text) { - const ssize_t gt = text.find('>'); - if (gt < 0) return ""; /* no > */ - const ssize_t lt = text.find('<', gt+1); - if (lt < 0) return ""; /* no < */ - return text.substr(gt+1, lt-(gt+1)); - } - - // - // subpatterns - // - - const string LATLON("(-?[0-9]{1,3}\\.[0-9]{6,8})"); - const string ELEV("(-?[0-9]{1,6}\\.[0-9]{0,3})"); - - // - // the scanner - // - - class Scanner: public PatternScanner { - public: - Scanner(): PatternScanner("gps_lg"), Recorder(0), Lat(), Lon(), Ele(), Time(), Speed(), Course() {} - virtual ~Scanner() {} - - virtual Scanner* clone() const { return new Scanner(*this); } - - virtual void startup(const scanner_params& sp); - virtual void init(const scanner_params& sp); - virtual void initScan(const scanner_params&); - - feature_recorder* Recorder {}; - - void trkptHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void eleHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void timeHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void speedHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void courseHandler(const LG_SearchHit& hit, const scanner_params& sp); - - private: - Scanner(const Scanner& s): PatternScanner(s), Recorder(s.Recorder), Lat(s.Lat), Lon(s.Lon), Ele(s.Ele), Time(s.Time), Speed(s.Speed), Course(s.Course) {} - Scanner& operator=(const Scanner&); - - void clear(const scanner_params& sp, size_t pos); - - string Lat, Lon, Ele, Time, Speed, Course; - }; - - void Scanner::startup(const scanner_params& sp) { - sp.check_version(); - - sp.info->name = "gps_lg"; - sp.info->author = "Simson L. Garfinkel"; - sp.info->description = "Garmin Trackpt XML info"; - sp.info->scanner_version = "1.0"; - sp.info->feature_defs.push_back( feature_recorder_def("gps")); - } - - void Scanner::init(const scanner_params& sp) { - // - // patterns - // - - const string TRKPT("" + ELEV + ""); - - new Handler( - *this, - ELE, - DefaultEncodings, - DefaultOptions, - &Scanner::eleHandler - ); - - const string TIME(""); - - new Handler( - *this, - TIME, - DefaultEncodings, - DefaultOptions, - &Scanner::timeHandler - ); - - const string GPXTPX_SPEED("" + ELEV + ""); - - new Handler( - *this, - GPXTPX_SPEED, - DefaultEncodings, - DefaultOptions, - &Scanner::speedHandler - ); - - const string GPXTPX_COURSE("" + ELEV + ""); - - new Handler( - *this, - GPXTPX_COURSE, - DefaultEncodings, - DefaultOptions, - &Scanner::courseHandler - ); - } - - void Scanner::initScan(const scanner_params& sp) { - Recorder = &sp.named_feature_recorder("gps"); - } - - void Scanner::clear(const scanner_params& sp, size_t pos) { - // dump the current and go to the next - if (!Time.empty() || !Lat.empty() || !Lon.empty() || - !Ele.empty() || !Speed.empty() || !Course.empty()) { - const string what = Time + "," + Lat + "," + Lon + "," + - Ele + "," + Speed + "," + Course; - // NB: the pos is the *end* of the "hit" - Recorder->write(sp.sbuf.pos0 + pos, what, ""); - - Time.clear(); - Lat.clear(); - Lon.clear(); - Ele.clear(); - Speed.clear(); - Course.clear(); - } - } - - void Scanner::trkptHandler(const LG_SearchHit& hit, const scanner_params& sp) { - clear(sp, hit.Start); - Lat = get_quoted_attrib(reinterpret_cast(sp.sbuf.buf), "lat"); - Lon = get_quoted_attrib(reinterpret_cast(sp.sbuf.buf), "lon"); - } - - void Scanner::eleHandler(const LG_SearchHit& hit, const scanner_params& sp) { - Ele = get_cdata(reinterpret_cast(sp.sbuf.buf)); - } - - void Scanner::timeHandler(const LG_SearchHit& hit, const scanner_params& sp) { - Time = get_cdata(reinterpret_cast(sp.sbuf.buf)); - } - - void Scanner::speedHandler(const LG_SearchHit& hit, const scanner_params& sp) { - Speed = get_cdata(reinterpret_cast(sp.sbuf.buf)); - } - - void Scanner::courseHandler(const LG_SearchHit& hit, const scanner_params& sp) { - Course = get_cdata(reinterpret_cast(sp.sbuf.buf)); - } - - Scanner TheScanner; -} - -extern "C" -void scan_gps_lg(scanner_params &sp) { - scan_lg(gps::TheScanner, sp, rcb); -} - -#endif // HAVE_LIBLIGHTGREP From 287b882417f0330914739ed71e9cc90d371d8902 Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Tue, 11 Apr 2023 15:36:13 -0400 Subject: [PATCH 02/31] F - skeleton of new lightgrep scanner for BE2.0 --- src/bulk_extractor_scanners.h | 6 +- src/pattern_scanner.cpp | 582 +++++++++++++++++----------------- src/pattern_scanner.h | 154 ++++----- src/scan_lightgrep.cpp | 47 +-- 4 files changed, 396 insertions(+), 393 deletions(-) diff --git a/src/bulk_extractor_scanners.h b/src/bulk_extractor_scanners.h index 1ac62e1b..cb612273 100644 --- a/src/bulk_extractor_scanners.h +++ b/src/bulk_extractor_scanners.h @@ -64,9 +64,5 @@ SCANNER(zip) #ifdef HAVE_LIBLIGHTGREP -//SCANNER(accts_lg) -//SCANNER(base16_lg) -//SCANNER(email_lg) -//SCANNER(gps_lg) -//SCANNER(lightgrep) +SCANNER(lightgrep) #endif diff --git a/src/pattern_scanner.cpp b/src/pattern_scanner.cpp index 0f188128..07220fd2 100644 --- a/src/pattern_scanner.cpp +++ b/src/pattern_scanner.cpp @@ -1,307 +1,307 @@ #include "config.h" -// if liblightgrep isn't present, compiles to nothing +// // if liblightgrep isn't present, compiles to nothing #ifdef HAVE_LIBLIGHTGREP -#include "beregex.h" -#include "histogram.h" +// // #include "beregex.h" +// #include "be20_api/histogram_def.h" #include "pattern_scanner.h" -#include +// #include -#include -#include -#include -#include +// #include +// #include +// #include +// #include -#include +// #include -#ifdef LGBENCHMARK -#include -#endif +// #ifdef LGBENCHMARK +// #include +// #endif -namespace { - const char* DefaultEncodingsCStrings[] = {"UTF-8", "UTF-16LE"}; - const unsigned int NumDefaultEncodings = 2; -} +// namespace { +// const char* DefaultEncodingsCStrings[] = {"UTF-8", "UTF-16LE"}; +// const unsigned int NumDefaultEncodings = 2; +// } -bool PatternScanner::handleParseError(const Handler& h, LG_Error* err) const { - cerr << "Parse error on '" << h.RE << "' in " << Name - << ": " << err->Message << endl; - return false; -} +// bool PatternScanner::handleParseError(const Handler& h, LG_Error* err) const { +// cerr << "Parse error on '" << h.RE << "' in " << Name +// << ": " << err->Message << endl; +// return false; +// } void PatternScanner::shutdown(const scanner_params&) { - for (vector::iterator itr(Handlers.begin()); itr != Handlers.end(); ++itr) { - delete *itr; - } -} -/*********************************************************/ - -LightgrepController::LightgrepController() -: ParsedPattern(lg_create_pattern()), // Reuse the parsed pattern data structure for efficiency - Fsm(lg_create_fsm(1 << 20)), // Reserve space for 1M states in the automaton--will grow if needed - PatternInfo(lg_create_pattern_map(1000)), // Reserve space for 1000 patterns in the pattern map - Prog(0), - Scanners() -{ -} - -LightgrepController::~LightgrepController() { - lg_destroy_pattern(ParsedPattern); - lg_destroy_pattern_map(PatternInfo); - lg_destroy_program(Prog); -} - -LightgrepController& LightgrepController::Get() { - // Meyers Singleton. c.f. Effective C++ by Scott Meyers - static LightgrepController controller; - return controller; -} - -bool LightgrepController::addScanner(PatternScanner& scanner) { - // Add patterns and handlers from a Scanner to the centralized automaton - LG_Error* lgErr = 0; - - unsigned int patBegin = numeric_limits::max(), - patEnd = 0; - - int idx = -1; - - // iterate all the scanner's handlers - for (vector::const_iterator h(scanner.handlers().begin()); h != scanner.handlers().end(); ++h) { - bool good = false; - if (lg_parse_pattern(ParsedPattern, (*h)->RE.c_str(), &(*h)->Options, &lgErr)) { // parse the pattern - for (vector::const_iterator enc((*h)->Encodings.begin()); enc != (*h)->Encodings.end(); ++enc) { - idx = lg_add_pattern(Fsm, PatternInfo, ParsedPattern, enc->c_str(), &lgErr); // add the pattern for each given encoding - if (idx >= 0) { - // add the handler callback to the pattern map, associated with the pattern index - lg_pattern_info(PatternInfo, idx)->UserData = const_cast(static_cast(&((*h)->Callback))); - patBegin = std::min(patBegin, static_cast(idx)); - good = true; - } - } - -// std::cerr << '\t' << (int)((*h)->Options.FixedString) << '\t' << (int)((*h)->Options.CaseInsensitive) << std::endl; - } - if (!good) { - if (scanner.handleParseError(**h, lgErr)) { - lg_free_error(lgErr); - lgErr = 0; - } - else { - return false; - } - } - } - patEnd = lg_pattern_map_size(PatternInfo); - // record the range of this scanner's patterns in the central pattern map - scanner.patternRange() = make_pair(patBegin, patEnd); - Scanners.push_back(&scanner); - return true; -} - -/* note: findopts is now part of scanner_set.scanner_config, you need to pass that in here. */ -bool LightgrepController::addUserPatterns(PatternScanner& scanner, CallbackFnType* callbackPtr, const FindOpts& user) { - // Add patterns specified as keywords by the user - // Similar to above, but does not have a handler per pattern - unsigned int patBegin = lg_pattern_map_size(PatternInfo), - patEnd = 0; - - LG_KeyOptions opts; - opts.FixedString = 0; - opts.CaseInsensitive = 0; - - LG_Error *err = 0; - - // Add patterns from files - for (vector::const_iterator itr(user.Files.begin()); itr != user.Files.end(); ++itr) { - ifstream file(itr->c_str(), ios::in); - if (!file.is_open()) { - cerr << "Could not open pattern file '" << *itr << "'." << endl; - return false; - } - string contents = string(istreambuf_iterator(file), istreambuf_iterator()); - - const char* contentsCStr = contents.c_str(); - // Add all the patterns from the files in one fell swoop - if (lg_add_pattern_list(Fsm, PatternInfo, contentsCStr, itr->c_str(), DefaultEncodingsCStrings, 2, &opts, &err) < 0) { - vector lines; - istringstream input(contents); - string line; - while (input) { - getline(input, line); - lines.push_back(line); - } - LG_Error* cur(err); - while (cur) { - cerr << "Error in " << *itr << ", line " << cur->Index+1 << ", pattern '" << lines[cur->Index] - << "': " << cur->Message << endl; - cur = cur->Next; - } - lg_free_error(err); - return false; - } - } - // add patterns from single command-line arguments - for (vector::const_iterator itr(user.Patterns.begin()); itr != user.Patterns.end(); ++itr) { - bool good = false; - if (lg_parse_pattern(ParsedPattern, itr->c_str(), &opts, &err)) { - for (unsigned int i = 0; i < NumDefaultEncodings; ++i) { - if (lg_add_pattern(Fsm, PatternInfo, ParsedPattern, DefaultEncodingsCStrings[i], &err) >= 0) { - good = true; - } - } - } - if (!good) { - cerr << "Error on '" << *itr << "': " << err->Message << endl; - lg_free_error(err); - return false; - } - } - patEnd = lg_pattern_map_size(PatternInfo); - for (unsigned int i = patBegin; i < patEnd; ++i) { - lg_pattern_info(PatternInfo, i)->UserData = const_cast(static_cast(callbackPtr)); - } - scanner.patternRange() = make_pair(patBegin, patEnd); - Scanners.push_back(&scanner); - return true; -} - -void LightgrepController::regcomp() { - LG_ProgramOptions progOpts; - progOpts.Determinize = 1; - // Create an optimized, immutable form of the accumulated automaton - Prog = lg_create_program(Fsm, &progOpts); - lg_destroy_fsm(Fsm); - - cerr << lg_pattern_map_size(PatternInfo) << " lightgrep patterns, logic size is " << lg_program_size(Prog) << " bytes, " << Scanners.size() << " active scanners" << std::endl; - #ifdef LGBENCHMARK - cerr << "timer second ratio " << chrono::high_resolution_clock::period::num << "/" << - chrono::high_resolution_clock::period::den << endl; - #endif -} - -struct HitData { - // Everything we need for processing a hit - LightgrepController* lgc; - const vector* scannerTable; - const scanner_params* sp; - //const recursion_control_block* rcb; -}; - -void gotHit(void* userData, const LG_SearchHit* hit) { - #ifdef LGBENCHMARK - // no callback, just increment hit counter - ++(*static_cast(userData)); - #else - // trampoline back into LightgrepController::processHit() from the void* userData - HitData* hd(static_cast(userData)); - hd->lgc->processHit(*hd->scannerTable, *hit, *hd->sp, *hd->rcb); - #endif -} - -void LightgrepController::scan(const scanner_params& sp, const recursion_control_block &rcb) { - // Scan the sbuf for pattern hits, invoking various scanners' handlers as hits are encountered - if (!Prog) { - // we had no valid patterns, do nothing - return; - } - // First, clone all the scanners so that there's no shared data between threads - vector scannerTable(lg_pattern_map_size(PatternInfo)); // [Keyword Index -> scanner], no ownership - vector scannerList; // ownership list - for (vector::const_iterator itr(Scanners.begin()); itr != Scanners.end(); ++itr) { - PatternScanner *s = (*itr)->clone(); - scannerList.push_back(s); - for (unsigned int i = s->patternRange().first; i < s->patternRange().second; ++i) { - scannerTable[i] = s; - } - s->initScan(sp); // let the scanner know we're about to scan an sbuf - } - LG_ContextOptions ctxOpts; - ctxOpts.TraceBegin = 0xffffffffffffffff; - ctxOpts.TraceEnd = 0; - - LG_HCONTEXT ctx = lg_create_context(Prog, &ctxOpts); // create a search context; cannot be shared, so local to scan - - const sbuf_t &sbuf = sp.sbuf; - - HitData callbackInfo = { this, &scannerTable, &sp, &rcb }; - void* userData = &callbackInfo; - - #ifdef LGBENCHMARK // perform timings of lightgrep search functions only -- no callbacks - uint64_t hitCount = 0; - userData = &hitCount; // switch things out for a counter - - auto startClock = std::chrono::high_resolution_clock::now(); - // std::cout << "Starting block " << sbuf.pos0.str() << std::endl; - #endif - - // search the sbuf in one go - // the gotHit() function will be invoked for each pattern hit - if (lg_search(ctx, (const char*)sbuf.buf, (const char*)sbuf.buf + sbuf.pagesize, 0, userData, gotHit) < numeric_limits::max()) { - // resolve potential hits that want data into the sbuf margin, without beginning any new hits - lg_search_resolve(ctx, (const char*)sbuf.buf + sbuf.pagesize, (const char*)sbuf.buf + sbuf.bufsize, sbuf.pagesize, userData, gotHit); - } - // flush any remaining hits; there's no more data - lg_closeout_search(ctx, userData, gotHit); - - #ifdef LGBENCHMARK - auto endClock = std::chrono::high_resolution_clock::now(); - auto t = endClock - startClock; - double seconds = double(t.count() * chrono::high_resolution_clock::period::num) / chrono::high_resolution_clock::period::den; - double bw = double(sbuf.pagesize) / (seconds * 1024 * 1024); - std::stringstream buf; - buf << " ** Time: " << sbuf.pos0.str() << '\t' << sbuf.pagesize << '\t' << t.count() << '\t' << seconds<< '\t' << hitCount << '\t' << bw << std::endl; - std::cout << buf.str(); -// std::cout.flush(); - #endif - - lg_destroy_context(ctx); - - // don't call PatternScanner::shutdown() on these! that only happens on prototypes - for (vector::const_iterator itr(scannerList.begin()); itr != scannerList.end(); ++itr) { - (*itr)->finishScan(sp); // let the scanner know we're done with the sbuf - delete *itr; - } -} - -void LightgrepController::processHit(const vector& sTbl, const LG_SearchHit& hit, const scanner_params& sp, const recursion_control_block& rcb) { - // lookup the handler's callback functor in the pattern map, then invoke it - CallbackFnType* cbPtr(static_cast(lg_pattern_info(PatternInfo, hit.KeywordIndex)->UserData)); - ((*sTbl[hit.KeywordIndex]).*(*cbPtr))(hit, sp, rcb); // ...yep... -} - -unsigned int LightgrepController::numPatterns() const { - return lg_pattern_map_size(PatternInfo); + // for (vector::iterator itr(Handlers.begin()); itr != Handlers.end(); ++itr) { + // delete *itr; + // } } - -/*********************************************************/ - -void scan_lg(PatternScanner& scanner, class scanner_params &sp) { - // utility implementation of the normal scan function for a PatternScanner instance - switch (sp.phase) { - case scanner_params::PHASE_STARTUP: - scanner.startup(sp); - break; - case scanner_params::PHASE_INIT: - scanner.init(sp); - if (!LightgrepController::Get().addScanner(scanner)) { - // It's fine for user patterns not to parse, but there's no excuse for a scanner so exit. - cerr << "Aborting. Fix pattern or disable scanner to continue." << endl; - exit(EXIT_FAILURE); - } - break; - case scanner_params::PHASE_SHUTDOWN: - scanner.shutdown(sp); - break; - case scanner_params::PHASE_CLEANUP: - TODO - to something here. - default: - break; - } -} - -/*********************************************************/ +// /*********************************************************/ + +// LightgrepController::LightgrepController() +// : ParsedPattern(lg_create_pattern()), // Reuse the parsed pattern data structure for efficiency +// Fsm(lg_create_fsm(1 << 20)), // Reserve space for 1M states in the automaton--will grow if needed +// PatternInfo(lg_create_pattern_map(1000)), // Reserve space for 1000 patterns in the pattern map +// Prog(0), +// Scanners() +// { +// } + +// LightgrepController::~LightgrepController() { +// lg_destroy_pattern(ParsedPattern); +// lg_destroy_pattern_map(PatternInfo); +// lg_destroy_program(Prog); +// } + +// LightgrepController& LightgrepController::Get() { +// // Meyers Singleton. c.f. Effective C++ by Scott Meyers +// static LightgrepController controller; +// return controller; +// } + +// bool LightgrepController::addScanner(PatternScanner& scanner) { +// // Add patterns and handlers from a Scanner to the centralized automaton +// LG_Error* lgErr = 0; + +// unsigned int patBegin = numeric_limits::max(), +// patEnd = 0; + +// int idx = -1; + +// // iterate all the scanner's handlers +// for (vector::const_iterator h(scanner.handlers().begin()); h != scanner.handlers().end(); ++h) { +// bool good = false; +// if (lg_parse_pattern(ParsedPattern, (*h)->RE.c_str(), &(*h)->Options, &lgErr)) { // parse the pattern +// for (vector::const_iterator enc((*h)->Encodings.begin()); enc != (*h)->Encodings.end(); ++enc) { +// idx = lg_add_pattern(Fsm, PatternInfo, ParsedPattern, enc->c_str(), &lgErr); // add the pattern for each given encoding +// if (idx >= 0) { +// // add the handler callback to the pattern map, associated with the pattern index +// lg_pattern_info(PatternInfo, idx)->UserData = const_cast(static_cast(&((*h)->Callback))); +// patBegin = std::min(patBegin, static_cast(idx)); +// good = true; +// } +// } + +// // std::cerr << '\t' << (int)((*h)->Options.FixedString) << '\t' << (int)((*h)->Options.CaseInsensitive) << std::endl; +// } +// if (!good) { +// if (scanner.handleParseError(**h, lgErr)) { +// lg_free_error(lgErr); +// lgErr = 0; +// } +// else { +// return false; +// } +// } +// } +// patEnd = lg_pattern_map_size(PatternInfo); +// // record the range of this scanner's patterns in the central pattern map +// scanner.patternRange() = make_pair(patBegin, patEnd); +// Scanners.push_back(&scanner); +// return true; +// } + +// /* note: findopts is now part of scanner_set.scanner_config, you need to pass that in here. */ +// bool LightgrepController::addUserPatterns(PatternScanner& scanner, CallbackFnType* callbackPtr, const FindOpts& user) { +// // Add patterns specified as keywords by the user +// // Similar to above, but does not have a handler per pattern +// unsigned int patBegin = lg_pattern_map_size(PatternInfo), +// patEnd = 0; + +// LG_KeyOptions opts; +// opts.FixedString = 0; +// opts.CaseInsensitive = 0; + +// LG_Error *err = 0; + +// // Add patterns from files +// for (vector::const_iterator itr(user.Files.begin()); itr != user.Files.end(); ++itr) { +// ifstream file(itr->c_str(), ios::in); +// if (!file.is_open()) { +// cerr << "Could not open pattern file '" << *itr << "'." << endl; +// return false; +// } +// string contents = string(istreambuf_iterator(file), istreambuf_iterator()); + +// const char* contentsCStr = contents.c_str(); +// // Add all the patterns from the files in one fell swoop +// if (lg_add_pattern_list(Fsm, PatternInfo, contentsCStr, itr->c_str(), DefaultEncodingsCStrings, 2, &opts, &err) < 0) { +// vector lines; +// istringstream input(contents); +// string line; +// while (input) { +// getline(input, line); +// lines.push_back(line); +// } +// LG_Error* cur(err); +// while (cur) { +// cerr << "Error in " << *itr << ", line " << cur->Index+1 << ", pattern '" << lines[cur->Index] +// << "': " << cur->Message << endl; +// cur = cur->Next; +// } +// lg_free_error(err); +// return false; +// } +// } +// // add patterns from single command-line arguments +// for (vector::const_iterator itr(user.Patterns.begin()); itr != user.Patterns.end(); ++itr) { +// bool good = false; +// if (lg_parse_pattern(ParsedPattern, itr->c_str(), &opts, &err)) { +// for (unsigned int i = 0; i < NumDefaultEncodings; ++i) { +// if (lg_add_pattern(Fsm, PatternInfo, ParsedPattern, DefaultEncodingsCStrings[i], &err) >= 0) { +// good = true; +// } +// } +// } +// if (!good) { +// cerr << "Error on '" << *itr << "': " << err->Message << endl; +// lg_free_error(err); +// return false; +// } +// } +// patEnd = lg_pattern_map_size(PatternInfo); +// for (unsigned int i = patBegin; i < patEnd; ++i) { +// lg_pattern_info(PatternInfo, i)->UserData = const_cast(static_cast(callbackPtr)); +// } +// scanner.patternRange() = make_pair(patBegin, patEnd); +// Scanners.push_back(&scanner); +// return true; +// } + +// void LightgrepController::regcomp() { +// LG_ProgramOptions progOpts; +// progOpts.Determinize = 1; +// // Create an optimized, immutable form of the accumulated automaton +// Prog = lg_create_program(Fsm, &progOpts); +// lg_destroy_fsm(Fsm); + +// cerr << lg_pattern_map_size(PatternInfo) << " lightgrep patterns, logic size is " << lg_program_size(Prog) << " bytes, " << Scanners.size() << " active scanners" << std::endl; +// #ifdef LGBENCHMARK +// cerr << "timer second ratio " << chrono::high_resolution_clock::period::num << "/" << +// chrono::high_resolution_clock::period::den << endl; +// #endif +// } + +// struct HitData { +// // Everything we need for processing a hit +// LightgrepController* lgc; +// const vector* scannerTable; +// const scanner_params* sp; +// //const recursion_control_block* rcb; +// }; + +// void gotHit(void* userData, const LG_SearchHit* hit) { +// #ifdef LGBENCHMARK +// // no callback, just increment hit counter +// ++(*static_cast(userData)); +// #else +// // trampoline back into LightgrepController::processHit() from the void* userData +// HitData* hd(static_cast(userData)); +// hd->lgc->processHit(*hd->scannerTable, *hit, *hd->sp, *hd->rcb); +// #endif +// } + +// void LightgrepController::scan(const scanner_params& sp, const recursion_control_block &rcb) { +// // Scan the sbuf for pattern hits, invoking various scanners' handlers as hits are encountered +// if (!Prog) { +// // we had no valid patterns, do nothing +// return; +// } +// // First, clone all the scanners so that there's no shared data between threads +// vector scannerTable(lg_pattern_map_size(PatternInfo)); // [Keyword Index -> scanner], no ownership +// vector scannerList; // ownership list +// for (vector::const_iterator itr(Scanners.begin()); itr != Scanners.end(); ++itr) { +// PatternScanner *s = (*itr)->clone(); +// scannerList.push_back(s); +// for (unsigned int i = s->patternRange().first; i < s->patternRange().second; ++i) { +// scannerTable[i] = s; +// } +// s->initScan(sp); // let the scanner know we're about to scan an sbuf +// } +// LG_ContextOptions ctxOpts; +// ctxOpts.TraceBegin = 0xffffffffffffffff; +// ctxOpts.TraceEnd = 0; + +// LG_HCONTEXT ctx = lg_create_context(Prog, &ctxOpts); // create a search context; cannot be shared, so local to scan + +// const sbuf_t &sbuf = sp.sbuf; + +// HitData callbackInfo = { this, &scannerTable, &sp, &rcb }; +// void* userData = &callbackInfo; + +// #ifdef LGBENCHMARK // perform timings of lightgrep search functions only -- no callbacks +// uint64_t hitCount = 0; +// userData = &hitCount; // switch things out for a counter + +// auto startClock = std::chrono::high_resolution_clock::now(); +// // std::cout << "Starting block " << sbuf.pos0.str() << std::endl; +// #endif + +// // search the sbuf in one go +// // the gotHit() function will be invoked for each pattern hit +// if (lg_search(ctx, (const char*)sbuf.buf, (const char*)sbuf.buf + sbuf.pagesize, 0, userData, gotHit) < numeric_limits::max()) { +// // resolve potential hits that want data into the sbuf margin, without beginning any new hits +// lg_search_resolve(ctx, (const char*)sbuf.buf + sbuf.pagesize, (const char*)sbuf.buf + sbuf.bufsize, sbuf.pagesize, userData, gotHit); +// } +// // flush any remaining hits; there's no more data +// lg_closeout_search(ctx, userData, gotHit); + +// #ifdef LGBENCHMARK +// auto endClock = std::chrono::high_resolution_clock::now(); +// auto t = endClock - startClock; +// double seconds = double(t.count() * chrono::high_resolution_clock::period::num) / chrono::high_resolution_clock::period::den; +// double bw = double(sbuf.pagesize) / (seconds * 1024 * 1024); +// std::stringstream buf; +// buf << " ** Time: " << sbuf.pos0.str() << '\t' << sbuf.pagesize << '\t' << t.count() << '\t' << seconds<< '\t' << hitCount << '\t' << bw << std::endl; +// std::cout << buf.str(); +// // std::cout.flush(); +// #endif + +// lg_destroy_context(ctx); + +// // don't call PatternScanner::shutdown() on these! that only happens on prototypes +// for (vector::const_iterator itr(scannerList.begin()); itr != scannerList.end(); ++itr) { +// (*itr)->finishScan(sp); // let the scanner know we're done with the sbuf +// delete *itr; +// } +// } + +// void LightgrepController::processHit(const vector& sTbl, const LG_SearchHit& hit, const scanner_params& sp, const recursion_control_block& rcb) { +// // lookup the handler's callback functor in the pattern map, then invoke it +// CallbackFnType* cbPtr(static_cast(lg_pattern_info(PatternInfo, hit.KeywordIndex)->UserData)); +// ((*sTbl[hit.KeywordIndex]).*(*cbPtr))(hit, sp, rcb); // ...yep... +// } + +// unsigned int LightgrepController::numPatterns() const { +// return lg_pattern_map_size(PatternInfo); +// } + +// /*********************************************************/ + +// void scan_lg(PatternScanner& scanner, class scanner_params &sp) { +// // utility implementation of the normal scan function for a PatternScanner instance +// switch (sp.phase) { +// case scanner_params::PHASE_STARTUP: +// scanner.startup(sp); +// break; +// case scanner_params::PHASE_INIT: +// scanner.init(sp); +// if (!LightgrepController::Get().addScanner(scanner)) { +// // It's fine for user patterns not to parse, but there's no excuse for a scanner so exit. +// cerr << "Aborting. Fix pattern or disable scanner to continue." << endl; +// exit(EXIT_FAILURE); +// } +// break; +// case scanner_params::PHASE_SHUTDOWN: +// scanner.shutdown(sp); +// break; +// case scanner_params::PHASE_CLEANUP: +// TODO - to something here. +// default: +// break; +// } +// } + +// /*********************************************************/ #endif // HAVE_LIBLIGHTGREP diff --git a/src/pattern_scanner.h b/src/pattern_scanner.h index 4a6d4268..1c64a3b2 100644 --- a/src/pattern_scanner.h +++ b/src/pattern_scanner.h @@ -10,32 +10,32 @@ #include -#include "be13/plugin.h" +#include "be20_api/scanner_params.h" using namespace std; class PatternScanner; -/** - * the function prototype for a handler callback - * LG_SearchHit - LightGrep Search Hit. - * scanner_params - the parameters available to the scanner. - * recursion_control_clock - information about where we are in the recursive analysis. - */ +// /** +// * the function prototype for a handler callback +// * LG_SearchHit - LightGrep Search Hit. +// * scanner_params - the parameters available to the scanner. +// * recursion_control_clock - information about where we are in the recursive analysis. +// */ -typedef void (PatternScanner::*CallbackFnType)(const LG_SearchHit&, - const scanner_params& sp, - const recursion_control_block& rcb); +// typedef void (PatternScanner::*CallbackFnType)(const LG_SearchHit&, +// const scanner_params& sp, +// const recursion_control_block& rcb); -/*********************************************************/ +// /*********************************************************/ -struct Handler; +// struct Handler; -// Inherit from this to create your own Lightgrep-based scanners -// clone(), startup(), init(), and initScan() must be overridden +// // Inherit from this to create your own Lightgrep-based scanners +// // clone(), startup(), init(), and initScan() must be overridden class PatternScanner { public: - PatternScanner(const string& n): Name(n), Handlers(), PatternRange(0, 0) {} + PatternScanner(const string& n): Name(n) {} //Handlers(), PatternRange(0, 0) {} virtual ~PatternScanner() {} virtual PatternScanner* clone() const = 0; @@ -53,91 +53,91 @@ class PatternScanner { // return bool indicates whether scanner addition should be continued // default is to print message to stderr and quit parsing scanner patterns - virtual bool handleParseError(const Handler& h, LG_Error* err) const; + // virtual bool handleParseError(const Handler& h, LG_Error* err) const; - virtual void addHandler(const Handler* h) { - Handlers.push_back(h); - } + // virtual void addHandler(const Handler* h) { + // Handlers.push_back(h); + // } - virtual const vector& handlers() const { return Handlers; } + // virtual const vector& handlers() const { return Handlers; } - pair& patternRange() { return PatternRange; } - const pair& patternRange() const { return PatternRange; } + // pair& patternRange() { return PatternRange; } + // const pair& patternRange() const { return PatternRange; } protected: PatternScanner(const PatternScanner& s): - Name(s.Name), Handlers(s.Handlers), PatternRange(s.PatternRange) {} + Name(s.Name) {} //, Handlers(s.Handlers), PatternRange(s.PatternRange) {} string Name; - vector Handlers; + // vector Handlers; - pair PatternRange; // knows the label range of its associated patterns + // pair PatternRange; // knows the label range of its associated patterns }; -/*********************************************************/ - -struct Handler { - // Agglomeration of the scanner, pattern, encodings, parse options, and callback - template - Handler( - PatternScanner& scanner, - const string& re, - const vector& encs, - const LG_KeyOptions& opts, - Fn fn - ): - RE(re), - Encodings(encs), - Options(opts), - Callback(static_cast(fn)) - { - scanner.addHandler(this); - } - - string RE; - - vector Encodings; - - LG_KeyOptions Options; - - CallbackFnType Callback; -}; +// /*********************************************************/ -/*********************************************************/ +// struct Handler { +// // Agglomeration of the scanner, pattern, encodings, parse options, and callback +// template +// Handler( +// PatternScanner& scanner, +// const string& re, +// const vector& encs, +// const LG_KeyOptions& opts, +// Fn fn +// ): +// RE(re), +// Encodings(encs), +// Options(opts), +// Callback(static_cast(fn)) +// { +// scanner.addHandler(this); +// } -class LightgrepController { // Centralized search facility amongst PatternScanners -public: +// string RE; - static LightgrepController& Get(); // singleton instance +// vector Encodings; - bool addScanner(PatternScanner& scanner); - bool addUserPatterns(PatternScanner& scanner, CallbackFnType* callbackPtr, const FindOpts& userPatterns); +// LG_KeyOptions Options; - void regcomp(); - void scan(const scanner_params& sp, const recursion_control_block& rcb); - void processHit(const vector& sTbl, const LG_SearchHit& hit, const scanner_params& sp, const recursion_control_block& rcb); +// CallbackFnType Callback; +// }; - unsigned int numPatterns() const; +// /*********************************************************/ -private: - LightgrepController(); - LightgrepController(const LightgrepController&); - ~LightgrepController(); +// class LightgrepController { // Centralized search facility amongst PatternScanners +// public: - LightgrepController& operator=(const LightgrepController&); +// static LightgrepController& Get(); // singleton instance - LG_HPATTERN ParsedPattern; - LG_HFSM Fsm; - LG_HPATTERNMAP PatternInfo; - LG_HPROGRAM Prog; +// bool addScanner(PatternScanner& scanner); +// bool addUserPatterns(PatternScanner& scanner, CallbackFnType* callbackPtr, const FindOpts& userPatterns); - vector Scanners; -}; +// void regcomp(); +// void scan(const scanner_params& sp, const recursion_control_block& rcb); +// void processHit(const vector& sTbl, const LG_SearchHit& hit, const scanner_params& sp, const recursion_control_block& rcb); + +// unsigned int numPatterns() const; + +// private: +// LightgrepController(); +// LightgrepController(const LightgrepController&); +// ~LightgrepController(); + +// LightgrepController& operator=(const LightgrepController&); + +// LG_HPATTERN ParsedPattern; +// LG_HFSM Fsm; +// LG_HPATTERNMAP PatternInfo; +// LG_HPROGRAM Prog; + +// vector Scanners; +// }; -/*********************************************************/ +// /*********************************************************/ -// Utility function. Makes your scan function a one-liner, given a PatternScanner instance -void scan_lg(PatternScanner& scanner, struct scanner_params &sp; +// // Utility function. Makes your scan function a one-liner, given a PatternScanner instance +// void scan_lg(PatternScanner& scanner, struct scanner_params &sp; #endif #endif /* PATTERN_SCANNER_H */ diff --git a/src/scan_lightgrep.cpp b/src/scan_lightgrep.cpp index a9615222..c2b48e75 100644 --- a/src/scan_lightgrep.cpp +++ b/src/scan_lightgrep.cpp @@ -11,7 +11,7 @@ #include "be20_api/scanner_params.h" //#include "be20_api/beregex.h" -#include "histogram.h" +#include "be20_api/histogram_def.h" #include "pattern_scanner.h" #include @@ -29,26 +29,26 @@ namespace { // local namespace hides these from other translation units }; virtual void startup(const scanner_params& sp) { - sp.info.set_name("scan_lightgrep"); + sp.info->set_name("scan_lightgrep"); sp.info->author = "Jon Stewart"; sp.info->description = "Advanced search for patterns"; - sp.info->scanner_version = "0.2"; - sp.info->flags = scanner_info::SCANNER_FIND_SCANNER | scanner_info::SCANNER_FAST_FIND; - sp.info->feature_names.insert(name()); - sp.info->histogram_defs.insert(histogram_def( name(), "", "histogram", HistogramMaker::FLAG_LOWERCASE)); + sp.info->scanner_version = "1.0"; + // sp.info->flags = scanner_info::SCANNER_FIND_SCANNER | scanner_info::SCANNER_FAST_FIND; + // sp.info->feature_names.insert(name()); + // sp.info->histogram_defs.insert(histogram_def( name(), "", "histogram", HistogramMaker::FLAG_LOWERCASE)); } virtual void init(const scanner_params& sp) { } virtual void initScan(const scanner_params& sp) { - LgRec = &sp.named_feature_recorder(name()); + // LgRec = &sp.named_feature_recorder(name()); } feature_recorder* LgRec; void processHit(const LG_SearchHit& hit, const scanner_params& sp) { - LgRec->write_buf(sp.sbuf, hit.Start, hit.End - hit.Start); + // LgRec->write_buf(sp.sbuf, hit.Start, hit.End - hit.Start); } private: @@ -59,7 +59,7 @@ namespace { // local namespace hides these from other translation units FindScanner Scanner; - CallbackFnType ProcessHit; + // CallbackFnType ProcessHit; } extern "C" @@ -67,21 +67,28 @@ void scan_lightgrep(struct scanner_params &sp) { switch (sp.phase) { case scanner_params::PHASE_INIT: Scanner.startup(sp); - ProcessHit = static_cast(&FindScanner::processHit); + // ProcessHit = static_cast(&FindScanner::processHit); + break; + case scanner_params::PHASE_INIT2: + // { + // Scanner.init(sp); + // LightgrepController& lg(LightgrepController::Get()); + // lg.addUserPatterns(Scanner, &ProcessHit, sp.ss->sc); // note: FindOpts now passed in ScannerConfig + // lg.regcomp(); + // break; + // } + break; + case scanner_params::PHASE_ENABLED: break; - case scanner_params::PHASE_INIT: - { - Scanner.init(sp); - LightgrepController& lg(LightgrepController::Get()); - lg.addUserPatterns(Scanner, &ProcessHit, sp.ss->sc); // note: FindOpts now passed in ScannerConfig - lg.regcomp(); - break; - } case scanner_params::PHASE_SCAN: - LightgrepController::Get().scan(sp); + // LightgrepController::Get().scan(sp); break; case scanner_params::PHASE_SHUTDOWN: - Scanner.shutdown(sp); + // Scanner.shutdown(sp); + break; + case scanner_params::PHASE_CLEANUP: + break; + case scanner_params::PHASE_CLEANED: break; default: break; From dc034c8e1f872394e60d5d294952774870100d1f Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Wed, 19 Apr 2023 17:20:16 -0400 Subject: [PATCH 03/31] R - comment out unused phases --- src/scan_lightgrep.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/scan_lightgrep.cpp b/src/scan_lightgrep.cpp index c2b48e75..694a7b39 100644 --- a/src/scan_lightgrep.cpp +++ b/src/scan_lightgrep.cpp @@ -78,18 +78,21 @@ void scan_lightgrep(struct scanner_params &sp) { // break; // } break; - case scanner_params::PHASE_ENABLED: - break; + // PHASE_ENABLED is never current phase when this func is called + // case scanner_params::PHASE_ENABLED: + // break; case scanner_params::PHASE_SCAN: // LightgrepController::Get().scan(sp); break; case scanner_params::PHASE_SHUTDOWN: // Scanner.shutdown(sp); break; - case scanner_params::PHASE_CLEANUP: - break; - case scanner_params::PHASE_CLEANED: - break; + // no cleanup needs to happen because lightgrep controller handles dealloc + // case scanner_params::PHASE_CLEANUP: + // break; + // PHASE_CLEANED is never current phase when this func is called, used for internal bookkeeping + // case scanner_params::PHASE_CLEANED: + // break; default: break; } From d638a2ae6c1cc398a44598df4f6f43779e424f55 Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Wed, 26 Apr 2023 10:28:01 -0400 Subject: [PATCH 04/31] F!! - initialization of Lightgrep Controller --- src/pattern_scanner.cpp | 38 +++++++++++++++++++------------------- src/pattern_scanner.h | 26 +++++++++++++------------- src/scan_lightgrep.cpp | 10 +++++----- 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/src/pattern_scanner.cpp b/src/pattern_scanner.cpp index 07220fd2..1a1f82cd 100644 --- a/src/pattern_scanner.cpp +++ b/src/pattern_scanner.cpp @@ -7,7 +7,7 @@ // #include "be20_api/histogram_def.h" #include "pattern_scanner.h" -// #include +#include // #include // #include @@ -38,26 +38,26 @@ void PatternScanner::shutdown(const scanner_params&) { } // /*********************************************************/ -// LightgrepController::LightgrepController() -// : ParsedPattern(lg_create_pattern()), // Reuse the parsed pattern data structure for efficiency -// Fsm(lg_create_fsm(1 << 20)), // Reserve space for 1M states in the automaton--will grow if needed -// PatternInfo(lg_create_pattern_map(1000)), // Reserve space for 1000 patterns in the pattern map -// Prog(0), -// Scanners() -// { -// } +LightgrepController::LightgrepController() +: ParsedPattern(lg_create_pattern()), // Reuse the parsed pattern data structure for efficiency + Fsm(lg_create_fsm(1000, 1 << 20)), // Reserve space for 1M states in the automaton--will grow if needed + // PatternInfo(lg_create_pattern_map(1000)), // Reserve space for 1000 patterns in the pattern map + Prog(0), + Scanners() +{ +} -// LightgrepController::~LightgrepController() { -// lg_destroy_pattern(ParsedPattern); -// lg_destroy_pattern_map(PatternInfo); -// lg_destroy_program(Prog); -// } +LightgrepController::~LightgrepController() { + lg_destroy_pattern(ParsedPattern); + // lg_destroy_pattern_map(PatternInfo); + lg_destroy_program(Prog); +} -// LightgrepController& LightgrepController::Get() { -// // Meyers Singleton. c.f. Effective C++ by Scott Meyers -// static LightgrepController controller; -// return controller; -// } +LightgrepController& LightgrepController::Get() { + // Meyers Singleton. c.f. Effective C++ by Scott Meyers + static LightgrepController controller; + return controller; +} // bool LightgrepController::addScanner(PatternScanner& scanner) { // // Add patterns and handlers from a Scanner to the centralized automaton diff --git a/src/pattern_scanner.h b/src/pattern_scanner.h index 1c64a3b2..3795925e 100644 --- a/src/pattern_scanner.h +++ b/src/pattern_scanner.h @@ -105,10 +105,10 @@ class PatternScanner { // /*********************************************************/ -// class LightgrepController { // Centralized search facility amongst PatternScanners -// public: +class LightgrepController { // Centralized search facility amongst PatternScanners +public: -// static LightgrepController& Get(); // singleton instance + static LightgrepController& Get(); // singleton instance // bool addScanner(PatternScanner& scanner); // bool addUserPatterns(PatternScanner& scanner, CallbackFnType* callbackPtr, const FindOpts& userPatterns); @@ -119,20 +119,20 @@ class PatternScanner { // unsigned int numPatterns() const; -// private: -// LightgrepController(); -// LightgrepController(const LightgrepController&); -// ~LightgrepController(); +private: + LightgrepController(); + LightgrepController(const LightgrepController&); + ~LightgrepController(); // LightgrepController& operator=(const LightgrepController&); -// LG_HPATTERN ParsedPattern; -// LG_HFSM Fsm; -// LG_HPATTERNMAP PatternInfo; -// LG_HPROGRAM Prog; + LG_HPATTERN ParsedPattern; + LG_HFSM Fsm; + // LG_HPATTERNMAP PatternInfo; + LG_HPROGRAM Prog; -// vector Scanners; -// }; + vector Scanners; +}; // /*********************************************************/ diff --git a/src/scan_lightgrep.cpp b/src/scan_lightgrep.cpp index 694a7b39..534d8e4e 100644 --- a/src/scan_lightgrep.cpp +++ b/src/scan_lightgrep.cpp @@ -70,13 +70,13 @@ void scan_lightgrep(struct scanner_params &sp) { // ProcessHit = static_cast(&FindScanner::processHit); break; case scanner_params::PHASE_INIT2: - // { - // Scanner.init(sp); - // LightgrepController& lg(LightgrepController::Get()); - // lg.addUserPatterns(Scanner, &ProcessHit, sp.ss->sc); // note: FindOpts now passed in ScannerConfig + { + Scanner.init(sp); + LightgrepController& lg(LightgrepController::Get()); + // lg.addUserPatterns(Scanner, &ProcessHit, sp.ss->sc); // note: FindOpts now passed in ScannerConfig // lg.regcomp(); // break; - // } + } break; // PHASE_ENABLED is never current phase when this func is called // case scanner_params::PHASE_ENABLED: From 8ecacd138166b518c2fac7f5c908c1ae773f1af1 Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Tue, 2 May 2023 17:08:08 -0400 Subject: [PATCH 05/31] R - If PHASE_INIT2 happens >1, Get() causes Fsm to be empty on subsequent passes --- src/pattern_scanner.cpp | 10 +++++----- src/scan_lightgrep.cpp | 9 +++++---- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/pattern_scanner.cpp b/src/pattern_scanner.cpp index 1a1f82cd..2d0e8b92 100644 --- a/src/pattern_scanner.cpp +++ b/src/pattern_scanner.cpp @@ -53,11 +53,11 @@ LightgrepController::~LightgrepController() { lg_destroy_program(Prog); } -LightgrepController& LightgrepController::Get() { - // Meyers Singleton. c.f. Effective C++ by Scott Meyers - static LightgrepController controller; - return controller; -} +// LightgrepController& LightgrepController::Get() { +// // Meyers Singleton. c.f. Effective C++ by Scott Meyers +// static LightgrepController controller; +// return controller; +// } // bool LightgrepController::addScanner(PatternScanner& scanner) { // // Add patterns and handlers from a Scanner to the centralized automaton diff --git a/src/scan_lightgrep.cpp b/src/scan_lightgrep.cpp index 534d8e4e..363d77f9 100644 --- a/src/scan_lightgrep.cpp +++ b/src/scan_lightgrep.cpp @@ -64,6 +64,7 @@ namespace { // local namespace hides these from other translation units extern "C" void scan_lightgrep(struct scanner_params &sp) { + static std::unique_ptr lg_ptr; switch (sp.phase) { case scanner_params::PHASE_INIT: Scanner.startup(sp); @@ -72,9 +73,9 @@ void scan_lightgrep(struct scanner_params &sp) { case scanner_params::PHASE_INIT2: { Scanner.init(sp); - LightgrepController& lg(LightgrepController::Get()); - // lg.addUserPatterns(Scanner, &ProcessHit, sp.ss->sc); // note: FindOpts now passed in ScannerConfig - // lg.regcomp(); + lg_ptr.reset(new LightgrepController); + lg_ptr->addUserPatterns(Scanner/*, sp.ss->sc*/); // &ProcessHit, sp.ss->sc); // note: FindOpts now passed in ScannerConfig + lg_ptr->regcomp(); // break; } break; @@ -82,7 +83,7 @@ void scan_lightgrep(struct scanner_params &sp) { // case scanner_params::PHASE_ENABLED: // break; case scanner_params::PHASE_SCAN: - // LightgrepController::Get().scan(sp); + lg_ptr->scan(sp); break; case scanner_params::PHASE_SHUTDOWN: // Scanner.shutdown(sp); From ee7b242e46c9f58ca474cb9315396848a796321c Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Tue, 2 May 2023 17:15:25 -0400 Subject: [PATCH 06/31] F - simplify addUserPatterns for now --- src/pattern_scanner.cpp | 209 ++++++++++++++++------------------------ 1 file changed, 82 insertions(+), 127 deletions(-) diff --git a/src/pattern_scanner.cpp b/src/pattern_scanner.cpp index 2d0e8b92..fec57178 100644 --- a/src/pattern_scanner.cpp +++ b/src/pattern_scanner.cpp @@ -101,133 +101,88 @@ LightgrepController::~LightgrepController() { // return true; // } -// /* note: findopts is now part of scanner_set.scanner_config, you need to pass that in here. */ -// bool LightgrepController::addUserPatterns(PatternScanner& scanner, CallbackFnType* callbackPtr, const FindOpts& user) { -// // Add patterns specified as keywords by the user -// // Similar to above, but does not have a handler per pattern -// unsigned int patBegin = lg_pattern_map_size(PatternInfo), -// patEnd = 0; - -// LG_KeyOptions opts; -// opts.FixedString = 0; -// opts.CaseInsensitive = 0; - -// LG_Error *err = 0; - -// // Add patterns from files -// for (vector::const_iterator itr(user.Files.begin()); itr != user.Files.end(); ++itr) { -// ifstream file(itr->c_str(), ios::in); -// if (!file.is_open()) { -// cerr << "Could not open pattern file '" << *itr << "'." << endl; -// return false; -// } -// string contents = string(istreambuf_iterator(file), istreambuf_iterator()); - -// const char* contentsCStr = contents.c_str(); -// // Add all the patterns from the files in one fell swoop -// if (lg_add_pattern_list(Fsm, PatternInfo, contentsCStr, itr->c_str(), DefaultEncodingsCStrings, 2, &opts, &err) < 0) { -// vector lines; -// istringstream input(contents); -// string line; -// while (input) { -// getline(input, line); -// lines.push_back(line); -// } -// LG_Error* cur(err); -// while (cur) { -// cerr << "Error in " << *itr << ", line " << cur->Index+1 << ", pattern '" << lines[cur->Index] -// << "': " << cur->Message << endl; -// cur = cur->Next; -// } -// lg_free_error(err); -// return false; -// } -// } -// // add patterns from single command-line arguments -// for (vector::const_iterator itr(user.Patterns.begin()); itr != user.Patterns.end(); ++itr) { -// bool good = false; -// if (lg_parse_pattern(ParsedPattern, itr->c_str(), &opts, &err)) { -// for (unsigned int i = 0; i < NumDefaultEncodings; ++i) { -// if (lg_add_pattern(Fsm, PatternInfo, ParsedPattern, DefaultEncodingsCStrings[i], &err) >= 0) { -// good = true; -// } -// } -// } -// if (!good) { -// cerr << "Error on '" << *itr << "': " << err->Message << endl; -// lg_free_error(err); -// return false; -// } -// } -// patEnd = lg_pattern_map_size(PatternInfo); -// for (unsigned int i = patBegin; i < patEnd; ++i) { -// lg_pattern_info(PatternInfo, i)->UserData = const_cast(static_cast(callbackPtr)); -// } -// scanner.patternRange() = make_pair(patBegin, patEnd); -// Scanners.push_back(&scanner); -// return true; -// } - -// void LightgrepController::regcomp() { -// LG_ProgramOptions progOpts; -// progOpts.Determinize = 1; -// // Create an optimized, immutable form of the accumulated automaton -// Prog = lg_create_program(Fsm, &progOpts); -// lg_destroy_fsm(Fsm); - -// cerr << lg_pattern_map_size(PatternInfo) << " lightgrep patterns, logic size is " << lg_program_size(Prog) << " bytes, " << Scanners.size() << " active scanners" << std::endl; -// #ifdef LGBENCHMARK -// cerr << "timer second ratio " << chrono::high_resolution_clock::period::num << "/" << -// chrono::high_resolution_clock::period::den << endl; -// #endif -// } - -// struct HitData { -// // Everything we need for processing a hit -// LightgrepController* lgc; -// const vector* scannerTable; -// const scanner_params* sp; -// //const recursion_control_block* rcb; -// }; - -// void gotHit(void* userData, const LG_SearchHit* hit) { -// #ifdef LGBENCHMARK -// // no callback, just increment hit counter -// ++(*static_cast(userData)); -// #else -// // trampoline back into LightgrepController::processHit() from the void* userData -// HitData* hd(static_cast(userData)); -// hd->lgc->processHit(*hd->scannerTable, *hit, *hd->sp, *hd->rcb); -// #endif -// } - -// void LightgrepController::scan(const scanner_params& sp, const recursion_control_block &rcb) { -// // Scan the sbuf for pattern hits, invoking various scanners' handlers as hits are encountered -// if (!Prog) { -// // we had no valid patterns, do nothing -// return; -// } -// // First, clone all the scanners so that there's no shared data between threads -// vector scannerTable(lg_pattern_map_size(PatternInfo)); // [Keyword Index -> scanner], no ownership -// vector scannerList; // ownership list -// for (vector::const_iterator itr(Scanners.begin()); itr != Scanners.end(); ++itr) { -// PatternScanner *s = (*itr)->clone(); -// scannerList.push_back(s); -// for (unsigned int i = s->patternRange().first; i < s->patternRange().second; ++i) { -// scannerTable[i] = s; -// } -// s->initScan(sp); // let the scanner know we're about to scan an sbuf -// } -// LG_ContextOptions ctxOpts; -// ctxOpts.TraceBegin = 0xffffffffffffffff; -// ctxOpts.TraceEnd = 0; - -// LG_HCONTEXT ctx = lg_create_context(Prog, &ctxOpts); // create a search context; cannot be shared, so local to scan - -// const sbuf_t &sbuf = sp.sbuf; - -// HitData callbackInfo = { this, &scannerTable, &sp, &rcb }; -// void* userData = &callbackInfo; +/* note: findopts is now part of scanner_set.scanner_config, you need to pass that in here. */ +bool LightgrepController::addUserPatterns(PatternScanner& scanner /* const FindOpts& user*/ ) { // CallbackFnType* callbackPtr, const FindOpts& user) { + + LG_Error *err = 0; + + LG_KeyOptions opts; + opts.FixedString = 0; + opts.CaseInsensitive = 0; + + int result = lg_parse_pattern(ParsedPattern, "julia", &opts, &err); + + if (result == 0) { + int index = lg_add_pattern(Fsm, ParsedPattern, "US-ASCII", 0, &err); + if (index >= 0) { + return true; + } + } + + // // Add patterns specified as keywords by the user + // // Similar to above, but does not have a handler per pattern + // unsigned int patBegin = lg_pattern_map_size(PatternInfo), + // patEnd = 0; + + // LG_KeyOptions opts; + // opts.FixedString = 0; + // opts.CaseInsensitive = 0; + + // LG_Error *err = 0; + + // // Add patterns from files + // for (vector::const_iterator itr(user.Files.begin()); itr != user.Files.end(); ++itr) { + // ifstream file(itr->c_str(), ios::in); + // if (!file.is_open()) { + // cerr << "Could not open pattern file '" << *itr << "'." << endl; + // return false; + // } + // string contents = string(istreambuf_iterator(file), istreambuf_iterator()); + + // const char* contentsCStr = contents.c_str(); + // // Add all the patterns from the files in one fell swoop + // if (lg_add_pattern_list(Fsm, PatternInfo, contentsCStr, itr->c_str(), DefaultEncodingsCStrings, 2, &opts, &err) < 0) { + // vector lines; + // istringstream input(contents); + // string line; + // while (input) { + // getline(input, line); + // lines.push_back(line); + // } + // LG_Error* cur(err); + // while (cur) { + // cerr << "Error in " << *itr << ", line " << cur->Index+1 << ", pattern '" << lines[cur->Index] + // << "': " << cur->Message << endl; + // cur = cur->Next; + // } + // lg_free_error(err); + // return false; + // } + // } + // // add patterns from single command-line arguments + // for (vector::const_iterator itr(user.Patterns.begin()); itr != user.Patterns.end(); ++itr) { + // bool good = false; + // if (lg_parse_pattern(ParsedPattern, itr->c_str(), &opts, &err)) { + // for (unsigned int i = 0; i < NumDefaultEncodings; ++i) { + // if (lg_add_pattern(Fsm, PatternInfo, ParsedPattern, DefaultEncodingsCStrings[i], &err) >= 0) { + // good = true; + // } + // } + // } + // if (!good) { + // cerr << "Error on '" << *itr << "': " << err->Message << endl; + // lg_free_error(err); + // return false; + // } + // } + // patEnd = lg_pattern_map_size(PatternInfo); + // for (unsigned int i = patBegin; i < patEnd; ++i) { + // lg_pattern_info(PatternInfo, i)->UserData = const_cast(static_cast(callbackPtr)); + // } + // scanner.patternRange() = make_pair(patBegin, patEnd); + // Scanners.push_back(&scanner); + return false; +} // #ifdef LGBENCHMARK // perform timings of lightgrep search functions only -- no callbacks // uint64_t hitCount = 0; From 7e1eefa9e5f9ccb466b46c176e20b5c706ecb8ea Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Tue, 2 May 2023 17:18:19 -0400 Subject: [PATCH 07/31] F - make regcomp, gotHit, HitData, and scan work with new skeleton --- src/pattern_scanner.cpp | 114 +++++++++++++++++++++++++++------------- 1 file changed, 78 insertions(+), 36 deletions(-) diff --git a/src/pattern_scanner.cpp b/src/pattern_scanner.cpp index fec57178..c83695ee 100644 --- a/src/pattern_scanner.cpp +++ b/src/pattern_scanner.cpp @@ -184,42 +184,84 @@ bool LightgrepController::addUserPatterns(PatternScanner& scanner /* const FindO return false; } -// #ifdef LGBENCHMARK // perform timings of lightgrep search functions only -- no callbacks -// uint64_t hitCount = 0; -// userData = &hitCount; // switch things out for a counter - -// auto startClock = std::chrono::high_resolution_clock::now(); -// // std::cout << "Starting block " << sbuf.pos0.str() << std::endl; -// #endif - -// // search the sbuf in one go -// // the gotHit() function will be invoked for each pattern hit -// if (lg_search(ctx, (const char*)sbuf.buf, (const char*)sbuf.buf + sbuf.pagesize, 0, userData, gotHit) < numeric_limits::max()) { -// // resolve potential hits that want data into the sbuf margin, without beginning any new hits -// lg_search_resolve(ctx, (const char*)sbuf.buf + sbuf.pagesize, (const char*)sbuf.buf + sbuf.bufsize, sbuf.pagesize, userData, gotHit); -// } -// // flush any remaining hits; there's no more data -// lg_closeout_search(ctx, userData, gotHit); - -// #ifdef LGBENCHMARK -// auto endClock = std::chrono::high_resolution_clock::now(); -// auto t = endClock - startClock; -// double seconds = double(t.count() * chrono::high_resolution_clock::period::num) / chrono::high_resolution_clock::period::den; -// double bw = double(sbuf.pagesize) / (seconds * 1024 * 1024); -// std::stringstream buf; -// buf << " ** Time: " << sbuf.pos0.str() << '\t' << sbuf.pagesize << '\t' << t.count() << '\t' << seconds<< '\t' << hitCount << '\t' << bw << std::endl; -// std::cout << buf.str(); -// // std::cout.flush(); -// #endif - -// lg_destroy_context(ctx); - -// // don't call PatternScanner::shutdown() on these! that only happens on prototypes -// for (vector::const_iterator itr(scannerList.begin()); itr != scannerList.end(); ++itr) { -// (*itr)->finishScan(sp); // let the scanner know we're done with the sbuf -// delete *itr; -// } -// } +void LightgrepController::regcomp() { + LG_ProgramOptions progOpts; + progOpts.DeterminizeDepth = 10; + // Create an optimized, immutable form of the accumulated automaton + Prog = lg_create_program(Fsm, &progOpts); + lg_destroy_fsm(Fsm); + Fsm = 0; + + // cerr << lg_pattern_map_size(PatternInfo) << " lightgrep patterns, logic size is " << lg_program_size(Prog) << " bytes, " << Scanners.size() << " active scanners" << std::endl; + #ifdef LGBENCHMARK + cerr << "timer second ratio " << chrono::high_resolution_clock::period::num << "/" << + chrono::high_resolution_clock::period::den << endl; + #endif +} + +struct HitData { + feature_recorder &recorder; + const sbuf_t &sbuf; +}; + +void gotHit(void* userData, const LG_SearchHit* hit) { + #ifdef LGBENCHMARK + // no callback, just increment hit counter + ++(*static_cast(userData)); + #else + // trampoline back into LightgrepController::processHit() from the void* userData + HitData* data(reinterpret_cast(userData)); + // data->recorder.write_buf(sbuf, pos+offset, len); + #endif +} + +void LightgrepController::scan(const scanner_params& sp) { + // Scan the sbuf for pattern hits, invoking various scanners' handlers as hits are encountered + if (!Prog) { + // we had no valid patterns, do nothing + return; + } + + LG_ContextOptions ctxOpts; + ctxOpts.TraceBegin = 0xffffffffffffffff; + ctxOpts.TraceEnd = 0; + + LG_HCONTEXT ctx = lg_create_context(Prog, &ctxOpts); // create a search context; cannot be shared, so local to scan + + const sbuf_t &sbuf = *sp.sbuf; + HitData callbackInfo = { sp.named_feature_recorder("lightgrep"), *sp.sbuf }; + void* userData = &callbackInfo; + + #ifdef LGBENCHMARK // perform timings of lightgrep search functions only -- no callbacks + uint64_t hitCount = 0; + userData = &hitCount; // switch things out for a counter + + auto startClock = std::chrono::high_resolution_clock::now(); + // std::cout << "Starting block " << sbuf.pos0.str() << std::endl; + #endif + + // search the sbuf in one go + // the gotHit() function will be invoked for each pattern hit + if (lg_search(ctx, (const char*)sbuf.get_buf(), (const char*)sbuf.get_buf() + sbuf.pagesize, 0, userData, nullptr/*gotHit*/) < numeric_limits::max()) { + // resolve potential hits that want data into the sbuf margin, without beginning any new hits + lg_search_resolve(ctx, (const char*)sbuf.get_buf() + sbuf.pagesize, (const char*)sbuf.get_buf() + sbuf.bufsize, sbuf.pagesize, userData, nullptr/*gotHit*/); + } + // flush any remaining hits; there's no more data + lg_closeout_search(ctx, userData, nullptr/*gotHit*/); + + #ifdef LGBENCHMARK + auto endClock = std::chrono::high_resolution_clock::now(); + auto t = endClock - startClock; + double seconds = double(t.count() * chrono::high_resolution_clock::period::num) / chrono::high_resolution_clock::period::den; + double bw = double(sbuf.pagesize) / (seconds * 1024 * 1024); + std::stringstream buf; + buf << " ** Time: " << sbuf.pos0.str() << '\t' << sbuf.pagesize << '\t' << t.count() << '\t' << seconds<< '\t' << hitCount << '\t' << bw << std::endl; + std::cout << buf.str(); +// std::cout.flush(); + #endif + + lg_destroy_context(ctx); +} // void LightgrepController::processHit(const vector& sTbl, const LG_SearchHit& hit, const scanner_params& sp, const recursion_control_block& rcb) { // // lookup the handler's callback functor in the pattern map, then invoke it From c629d3a4eb13608b190d0ca8daaadb3039db5da8 Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Tue, 2 May 2023 17:19:23 -0400 Subject: [PATCH 08/31] R - Prog may not be initialized before numPatterns is called --- src/pattern_scanner.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pattern_scanner.cpp b/src/pattern_scanner.cpp index c83695ee..e61489d5 100644 --- a/src/pattern_scanner.cpp +++ b/src/pattern_scanner.cpp @@ -269,9 +269,9 @@ void LightgrepController::scan(const scanner_params& sp) { // ((*sTbl[hit.KeywordIndex]).*(*cbPtr))(hit, sp, rcb); // ...yep... // } -// unsigned int LightgrepController::numPatterns() const { -// return lg_pattern_map_size(PatternInfo); -// } +unsigned int LightgrepController::numPatterns() const { + return Prog ? lg_prog_pattern_count(Prog) : 0; //lg_pattern_map_size(PatternInfo); +} // /*********************************************************/ From 72c69a80b1c53f8479a5b5ba5cfd8aeacc5e395d Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Tue, 2 May 2023 17:20:18 -0400 Subject: [PATCH 09/31] R - fix declarations --- src/pattern_scanner.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/pattern_scanner.h b/src/pattern_scanner.h index 3795925e..45f08728 100644 --- a/src/pattern_scanner.h +++ b/src/pattern_scanner.h @@ -108,22 +108,21 @@ class PatternScanner { class LightgrepController { // Centralized search facility amongst PatternScanners public: - static LightgrepController& Get(); // singleton instance + LightgrepController(); + LightgrepController(const LightgrepController&); + ~LightgrepController(); + // static LightgrepController& Get(); // singleton instance // bool addScanner(PatternScanner& scanner); -// bool addUserPatterns(PatternScanner& scanner, CallbackFnType* callbackPtr, const FindOpts& userPatterns); + bool addUserPatterns(PatternScanner& scanner/*, const FindOpts& userPatterns*/); // CallbackFnType* callbackPtr, const FindOpts& userPatterns); -// void regcomp(); -// void scan(const scanner_params& sp, const recursion_control_block& rcb); + void regcomp(); + void scan(const scanner_params& sp); // void processHit(const vector& sTbl, const LG_SearchHit& hit, const scanner_params& sp, const recursion_control_block& rcb); -// unsigned int numPatterns() const; + unsigned int numPatterns() const; private: - LightgrepController(); - LightgrepController(const LightgrepController&); - ~LightgrepController(); - // LightgrepController& operator=(const LightgrepController&); LG_HPATTERN ParsedPattern; From 99bc0307b2027b7addd6f485f0092426006277ea Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Tue, 2 May 2023 17:21:23 -0400 Subject: [PATCH 10/31] F - Append lightgrep feature recorder to feature defs --- src/scan_lightgrep.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scan_lightgrep.cpp b/src/scan_lightgrep.cpp index 363d77f9..68f4597c 100644 --- a/src/scan_lightgrep.cpp +++ b/src/scan_lightgrep.cpp @@ -33,6 +33,7 @@ namespace { // local namespace hides these from other translation units sp.info->author = "Jon Stewart"; sp.info->description = "Advanced search for patterns"; sp.info->scanner_version = "1.0"; + sp.info->feature_defs.push_back( feature_recorder_def("lightgrep")); // sp.info->flags = scanner_info::SCANNER_FIND_SCANNER | scanner_info::SCANNER_FAST_FIND; // sp.info->feature_names.insert(name()); // sp.info->histogram_defs.insert(histogram_def( name(), "", "histogram", HistogramMaker::FLAG_LOWERCASE)); From 1c1c4d0c276ede661d9a3cd08acc3c0e3b9ef280 Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Thu, 4 May 2023 15:39:52 -0400 Subject: [PATCH 11/31] Delete unused scanner --- src/scan_httpheader_lg.cpp | 93 -------------------------------------- 1 file changed, 93 deletions(-) delete mode 100644 src/scan_httpheader_lg.cpp diff --git a/src/scan_httpheader_lg.cpp b/src/scan_httpheader_lg.cpp deleted file mode 100644 index 4b0d37fc..00000000 --- a/src/scan_httpheader_lg.cpp +++ /dev/null @@ -1,93 +0,0 @@ -#include - -namespace httpheader { - // - // subpatterns - // - - const std::string PC("[\\x20-\\x7E]"); - - const std::string XPC("[\\x20-\\x7E--\"]"); - - /* - * RFC 2616, Page 12 - */ - /* Account for over-zealously translated line breaks */ - /* HTTP_LWS - Linear White Space (new line and a whitespace character) */ - const std::string HTTP_CRLF("(\\x0D?\\x0A)"); - const std::string HTTP_LWS(HTTP_CRLF + "[ \\t]"); - - /* - * Keeping it simple: no HTTP_CTEXT, HTTP_QUOTED_PAIR, or keeping count - * of parentheses. The distinguishing part of COMMENTs is they are - * allowed to have line breaks, if followed by whitespace. - * - * TODO Might still need to account for RFC 2407. - */ - const std::string HTTP_COMMENT("(" + PC + "|" + HTTP_LWS + "|\\t)"); - - // - // patterns - // - - /* - * RFC 2616, Sections 14.38 and 14.43 - * These fields are allowed multi-line values (comments). - * - * For some reason, specifying the field value as: - * ({XPC}|{HTTP_LWS})+ - * causes the NFA rule set to explode to >32000 rules, making flex refuse - * to compile. - */ - const std::string SERVER_OR_UA("(Server|User-Agent):[ \\t]?" + PC + "{1,80}"); - - /* - * RFC 2616, Section 14.23 - */ - const std::string HOST("Host:[ \\t]?[a-zA-Z0-9._:]{1,256}"); - - /* - * These headers have a general set of characters allowed in their field - * value, including double-quote. - * - * Keep-Alive is defined in RFC 2068, Section 19.7.1.1. Allowable tokens - * seem to include doublequote, per "value" definition in RFC 2068, - * Section 3.7. - * - * Authorization, Proxy-Authenticate, Proxy-Authorization and WWW- - * Authenticate are defined in RFC 2617, not yet reviewed. Assuming PC - * character set allowed for now. - * - * Content-Location, Location and Referer (RFC 2616, Sections 14.14, 14.30 - * and 14.36) have a URI as the field value. - * SLG: Limited to 80 characters - */ - const std::string HEADERS_1("(Accept|Accept-Ranges|Authorization|Cache-Control|Content-Location|Etag|Expect|Keep-Alive|If-Match|If-None-Match|If-Range|Pragma|Proxy-Authenticate|Proxy-Authorization|Referer|TE|Transfer-Encoding|Warning|WWW-Authenticate):[ \\t]?" + PC + "{1,80}"); - - /* - * These headers have a general set of characters allowed in their field - * value, excluding double-quote. - * - * Date and If-Modified-Since reference RFCs 1123 and 850 (RFC 2616 - * Section 3.3.1), not yet reviewed. - * Double-quotes are assumed excluded. - * - * Set-Cookie: RFC 6265, Section 4.1.1, Page 9 - * This header field is allowed to be sent multiple times in the same - * header. - * - * Cookie: RFC 6265, Section 4.2.1, Page 13 - * The cookie length does not seem to have a limit, but cookie stores should - * be able to store at least 4096 bytes for a cookie [RFC 6265, Section 6.1]. - * - * From: should contain an email address. - */ - const std::string HEADERS_2("(Accept-Charset|Accept-Encoding|Accept-Language|Age|Allow|Connection|Content-Encoding|Content-Language|Content-MD5|Content-Range|Content-Type|Cookie|Date|From|If-Modified-Since|If-Unmodified-Since|Last-Modified|Range|Retry-After|Set-Cookie|Trailer|Upgrade|Vary):[ \\t]?" + XPC + "{1,80}"); - - const std::string VIA("Via:[ \\t]?" + HTTP_COMMENT + "{1,256}"); - - /* - * RFC 2616, Sections 14.13 and 14.31 - */ - const std::string HEADERS_3("(Content-Length|Max-Forwards):[ \\t]?[0-9]{1,12}"); -} From 4579d9c8bddbc3f913124d48a32704e1be57ab33 Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Thu, 4 May 2023 15:59:53 -0400 Subject: [PATCH 12/31] give write_buf good args --- src/pattern_scanner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pattern_scanner.cpp b/src/pattern_scanner.cpp index e61489d5..ba21dfab 100644 --- a/src/pattern_scanner.cpp +++ b/src/pattern_scanner.cpp @@ -211,7 +211,7 @@ void gotHit(void* userData, const LG_SearchHit* hit) { #else // trampoline back into LightgrepController::processHit() from the void* userData HitData* data(reinterpret_cast(userData)); - // data->recorder.write_buf(sbuf, pos+offset, len); + data->recorder.write_buf(data->sbuf, hit->Start, hit->End - hit->Start); #endif } From 75533056b20e3bbf5c62eb3cbfc90d92b8aa2e03 Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Thu, 4 May 2023 16:55:27 -0400 Subject: [PATCH 13/31] Result is positive upon success --- src/pattern_scanner.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/pattern_scanner.cpp b/src/pattern_scanner.cpp index ba21dfab..48aa012a 100644 --- a/src/pattern_scanner.cpp +++ b/src/pattern_scanner.cpp @@ -110,9 +110,9 @@ bool LightgrepController::addUserPatterns(PatternScanner& scanner /* const FindO opts.FixedString = 0; opts.CaseInsensitive = 0; - int result = lg_parse_pattern(ParsedPattern, "julia", &opts, &err); + int result = lg_parse_pattern(ParsedPattern, "patricia", &opts, &err); - if (result == 0) { + if (result > 0) { int index = lg_add_pattern(Fsm, ParsedPattern, "US-ASCII", 0, &err); if (index >= 0) { return true; @@ -242,12 +242,12 @@ void LightgrepController::scan(const scanner_params& sp) { // search the sbuf in one go // the gotHit() function will be invoked for each pattern hit - if (lg_search(ctx, (const char*)sbuf.get_buf(), (const char*)sbuf.get_buf() + sbuf.pagesize, 0, userData, nullptr/*gotHit*/) < numeric_limits::max()) { + if (lg_search(ctx, (const char*)sbuf.get_buf(), (const char*)sbuf.get_buf() + sbuf.pagesize, 0, userData, gotHit) < numeric_limits::max()) { // resolve potential hits that want data into the sbuf margin, without beginning any new hits - lg_search_resolve(ctx, (const char*)sbuf.get_buf() + sbuf.pagesize, (const char*)sbuf.get_buf() + sbuf.bufsize, sbuf.pagesize, userData, nullptr/*gotHit*/); + lg_search_resolve(ctx, (const char*)sbuf.get_buf() + sbuf.pagesize, (const char*)sbuf.get_buf() + sbuf.bufsize, sbuf.pagesize, userData, gotHit); } // flush any remaining hits; there's no more data - lg_closeout_search(ctx, userData, nullptr/*gotHit*/); + lg_closeout_search(ctx, userData, gotHit); #ifdef LGBENCHMARK auto endClock = std::chrono::high_resolution_clock::now(); From a3616e0ff7fe4452a13457d9565ce2f4869f0001 Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Tue, 9 May 2023 18:23:06 -0400 Subject: [PATCH 14/31] F!! enable mulitple patterns passed through CLI --- src/pattern_scanner.cpp | 37 +++++++++++++++++++++++++------------ src/pattern_scanner.h | 2 +- src/scan_lightgrep.cpp | 3 ++- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/pattern_scanner.cpp b/src/pattern_scanner.cpp index 48aa012a..e21e8240 100644 --- a/src/pattern_scanner.cpp +++ b/src/pattern_scanner.cpp @@ -6,6 +6,7 @@ // // #include "beregex.h" // #include "be20_api/histogram_def.h" #include "pattern_scanner.h" +#include "scanner_set.h" #include @@ -20,10 +21,10 @@ // #include // #endif -// namespace { -// const char* DefaultEncodingsCStrings[] = {"UTF-8", "UTF-16LE"}; -// const unsigned int NumDefaultEncodings = 2; -// } +namespace { + const char* DefaultEncodingsCStrings[] = {"UTF-8", "UTF-16LE"}; + const unsigned int NumDefaultEncodings = 2; +} // bool PatternScanner::handleParseError(const Handler& h, LG_Error* err) const { // cerr << "Parse error on '" << h.RE << "' in " << Name @@ -102,7 +103,7 @@ LightgrepController::~LightgrepController() { // } /* note: findopts is now part of scanner_set.scanner_config, you need to pass that in here. */ -bool LightgrepController::addUserPatterns(PatternScanner& scanner /* const FindOpts& user*/ ) { // CallbackFnType* callbackPtr, const FindOpts& user) { +bool LightgrepController::addUserPatterns(PatternScanner& scanner, const vector& cli_patterns ) { // CallbackFnType* callbackPtr, const FindOpts& user) { LG_Error *err = 0; @@ -110,12 +111,24 @@ bool LightgrepController::addUserPatterns(PatternScanner& scanner /* const FindO opts.FixedString = 0; opts.CaseInsensitive = 0; - int result = lg_parse_pattern(ParsedPattern, "patricia", &opts, &err); - - if (result > 0) { - int index = lg_add_pattern(Fsm, ParsedPattern, "US-ASCII", 0, &err); - if (index >= 0) { - return true; + bool good = true; + + // add patterns from single command-line arguments + for (const auto& itr: cli_patterns) { + if (lg_parse_pattern(ParsedPattern, itr.c_str(), &opts, &err)) { + for (unsigned int i = 0; i < NumDefaultEncodings; ++i) { + if (lg_add_pattern(Fsm, ParsedPattern, DefaultEncodingsCStrings[i], 0, &err) < 0) { + good = false; + break; + } + } + } else { + good = false; + } + if (!good) { + cerr << "Error on '" << itr.c_str() << "': " << err->Message << endl; + lg_free_error(err); + return false; } } @@ -181,7 +194,7 @@ bool LightgrepController::addUserPatterns(PatternScanner& scanner /* const FindO // } // scanner.patternRange() = make_pair(patBegin, patEnd); // Scanners.push_back(&scanner); - return false; + return true; } void LightgrepController::regcomp() { diff --git a/src/pattern_scanner.h b/src/pattern_scanner.h index 45f08728..43d43120 100644 --- a/src/pattern_scanner.h +++ b/src/pattern_scanner.h @@ -114,7 +114,7 @@ class LightgrepController { // Centralized search facility amongst PatternScanne // static LightgrepController& Get(); // singleton instance // bool addScanner(PatternScanner& scanner); - bool addUserPatterns(PatternScanner& scanner/*, const FindOpts& userPatterns*/); // CallbackFnType* callbackPtr, const FindOpts& userPatterns); + bool addUserPatterns(PatternScanner& scanner, const vector& cli_patterns); void regcomp(); void scan(const scanner_params& sp); diff --git a/src/scan_lightgrep.cpp b/src/scan_lightgrep.cpp index 68f4597c..5828ba15 100644 --- a/src/scan_lightgrep.cpp +++ b/src/scan_lightgrep.cpp @@ -9,6 +9,7 @@ #include #include "be20_api/scanner_params.h" +#include "be20_api/scanner_set.h" //#include "be20_api/beregex.h" #include "be20_api/histogram_def.h" @@ -75,7 +76,7 @@ void scan_lightgrep(struct scanner_params &sp) { { Scanner.init(sp); lg_ptr.reset(new LightgrepController); - lg_ptr->addUserPatterns(Scanner/*, sp.ss->sc*/); // &ProcessHit, sp.ss->sc); // note: FindOpts now passed in ScannerConfig + lg_ptr->addUserPatterns(Scanner, sp.ss->find_patterns()); lg_ptr->regcomp(); // break; } From 5097c1ccf02c4c83e57e02eb9c6e29ec7d2b57f3 Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Tue, 9 May 2023 18:34:13 -0400 Subject: [PATCH 15/31] F - add user_files param to addUserPatterns to avoid scanner config access --- src/pattern_scanner.cpp | 38 +++++--------------------------------- src/pattern_scanner.h | 2 +- src/scan_lightgrep.cpp | 2 +- 3 files changed, 7 insertions(+), 35 deletions(-) diff --git a/src/pattern_scanner.cpp b/src/pattern_scanner.cpp index e21e8240..e5b6f1c9 100644 --- a/src/pattern_scanner.cpp +++ b/src/pattern_scanner.cpp @@ -103,7 +103,10 @@ LightgrepController::~LightgrepController() { // } /* note: findopts is now part of scanner_set.scanner_config, you need to pass that in here. */ -bool LightgrepController::addUserPatterns(PatternScanner& scanner, const vector& cli_patterns ) { // CallbackFnType* callbackPtr, const FindOpts& user) { +bool LightgrepController::addUserPatterns( + PatternScanner& scanner, + const vector& cli_patterns, + const vector& user_files) { // CallbackFnType* callbackPtr, const FindOpts& user) { LG_Error *err = 0; @@ -114,7 +117,7 @@ bool LightgrepController::addUserPatterns(PatternScanner& scanner, const vector< bool good = true; // add patterns from single command-line arguments - for (const auto& itr: cli_patterns) { + for (const auto& itr : cli_patterns) { if (lg_parse_pattern(ParsedPattern, itr.c_str(), &opts, &err)) { for (unsigned int i = 0; i < NumDefaultEncodings; ++i) { if (lg_add_pattern(Fsm, ParsedPattern, DefaultEncodingsCStrings[i], 0, &err) < 0) { @@ -132,17 +135,6 @@ bool LightgrepController::addUserPatterns(PatternScanner& scanner, const vector< } } - // // Add patterns specified as keywords by the user - // // Similar to above, but does not have a handler per pattern - // unsigned int patBegin = lg_pattern_map_size(PatternInfo), - // patEnd = 0; - - // LG_KeyOptions opts; - // opts.FixedString = 0; - // opts.CaseInsensitive = 0; - - // LG_Error *err = 0; - // // Add patterns from files // for (vector::const_iterator itr(user.Files.begin()); itr != user.Files.end(); ++itr) { // ifstream file(itr->c_str(), ios::in); @@ -172,26 +164,6 @@ bool LightgrepController::addUserPatterns(PatternScanner& scanner, const vector< // return false; // } // } - // // add patterns from single command-line arguments - // for (vector::const_iterator itr(user.Patterns.begin()); itr != user.Patterns.end(); ++itr) { - // bool good = false; - // if (lg_parse_pattern(ParsedPattern, itr->c_str(), &opts, &err)) { - // for (unsigned int i = 0; i < NumDefaultEncodings; ++i) { - // if (lg_add_pattern(Fsm, PatternInfo, ParsedPattern, DefaultEncodingsCStrings[i], &err) >= 0) { - // good = true; - // } - // } - // } - // if (!good) { - // cerr << "Error on '" << *itr << "': " << err->Message << endl; - // lg_free_error(err); - // return false; - // } - // } - // patEnd = lg_pattern_map_size(PatternInfo); - // for (unsigned int i = patBegin; i < patEnd; ++i) { - // lg_pattern_info(PatternInfo, i)->UserData = const_cast(static_cast(callbackPtr)); - // } // scanner.patternRange() = make_pair(patBegin, patEnd); // Scanners.push_back(&scanner); return true; diff --git a/src/pattern_scanner.h b/src/pattern_scanner.h index 43d43120..15a874c5 100644 --- a/src/pattern_scanner.h +++ b/src/pattern_scanner.h @@ -114,7 +114,7 @@ class LightgrepController { // Centralized search facility amongst PatternScanne // static LightgrepController& Get(); // singleton instance // bool addScanner(PatternScanner& scanner); - bool addUserPatterns(PatternScanner& scanner, const vector& cli_patterns); + bool addUserPatterns(PatternScanner& scanner, const vector& cli_patterns, const vector& user_files); void regcomp(); void scan(const scanner_params& sp); diff --git a/src/scan_lightgrep.cpp b/src/scan_lightgrep.cpp index 5828ba15..ff41c90a 100644 --- a/src/scan_lightgrep.cpp +++ b/src/scan_lightgrep.cpp @@ -76,7 +76,7 @@ void scan_lightgrep(struct scanner_params &sp) { { Scanner.init(sp); lg_ptr.reset(new LightgrepController); - lg_ptr->addUserPatterns(Scanner, sp.ss->find_patterns()); + lg_ptr->addUserPatterns(Scanner, sp.ss->find_patterns(), sp.ss->find_files()); lg_ptr->regcomp(); // break; } From 92b7f75af8e3124e67e7e8f5cd27c75e1dd192c5 Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Tue, 9 May 2023 19:16:52 -0400 Subject: [PATCH 16/31] F - Handle user files with parsed patterns in lightgrep scanner --- src/pattern_scanner.cpp | 62 ++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/src/pattern_scanner.cpp b/src/pattern_scanner.cpp index e5b6f1c9..616e9010 100644 --- a/src/pattern_scanner.cpp +++ b/src/pattern_scanner.cpp @@ -26,6 +26,7 @@ namespace { const unsigned int NumDefaultEncodings = 2; } + // bool PatternScanner::handleParseError(const Handler& h, LG_Error* err) const { // cerr << "Parse error on '" << h.RE << "' in " << Name // << ": " << err->Message << endl; @@ -135,37 +136,36 @@ bool LightgrepController::addUserPatterns( } } - // // Add patterns from files - // for (vector::const_iterator itr(user.Files.begin()); itr != user.Files.end(); ++itr) { - // ifstream file(itr->c_str(), ios::in); - // if (!file.is_open()) { - // cerr << "Could not open pattern file '" << *itr << "'." << endl; - // return false; - // } - // string contents = string(istreambuf_iterator(file), istreambuf_iterator()); - - // const char* contentsCStr = contents.c_str(); - // // Add all the patterns from the files in one fell swoop - // if (lg_add_pattern_list(Fsm, PatternInfo, contentsCStr, itr->c_str(), DefaultEncodingsCStrings, 2, &opts, &err) < 0) { - // vector lines; - // istringstream input(contents); - // string line; - // while (input) { - // getline(input, line); - // lines.push_back(line); - // } - // LG_Error* cur(err); - // while (cur) { - // cerr << "Error in " << *itr << ", line " << cur->Index+1 << ", pattern '" << lines[cur->Index] - // << "': " << cur->Message << endl; - // cur = cur->Next; - // } - // lg_free_error(err); - // return false; - // } - // } - // scanner.patternRange() = make_pair(patBegin, patEnd); - // Scanners.push_back(&scanner); + // Add patterns from files + for (const auto& itr : user_files) { + ifstream file(itr.c_str(), ios::in); + if (!file.is_open()) { + cerr << "Could not open pattern file '" << itr.c_str() << "'." << endl; + return false; + } + string contents = string(istreambuf_iterator(file), istreambuf_iterator()); + + const char* contentsCStr = contents.c_str(); + // Add all the patterns from the files in one fell swoop + if (lg_add_pattern_list(Fsm, contentsCStr, itr.c_str(), DefaultEncodingsCStrings, NumDefaultEncodings, &opts, &err) < 0) { + vector lines; + istringstream input(contents); + string line; + while (input) { + getline(input, line); + lines.push_back(line); + } + LG_Error* cur(err); + while (cur) { + cerr << "Error in " << itr.c_str() << ", line " << cur->Index+1 << ", pattern '" << lines[cur->Index] + << "': " << cur->Message << endl; + cur = cur->Next; + } + lg_free_error(err); + return false; + } + } + return true; } From c553d2b7e8b4ea8cc886771dfe33d398bdecade4 Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Tue, 9 May 2023 19:20:23 -0400 Subject: [PATCH 17/31] Remove unused or commented code --- src/pattern_scanner.cpp | 79 +++-------------------------------------- 1 file changed, 4 insertions(+), 75 deletions(-) diff --git a/src/pattern_scanner.cpp b/src/pattern_scanner.cpp index 616e9010..0c47a483 100644 --- a/src/pattern_scanner.cpp +++ b/src/pattern_scanner.cpp @@ -1,38 +1,22 @@ #include "config.h" -// // if liblightgrep isn't present, compiles to nothing +// if liblightgrep isn't present, compiles to nothing #ifdef HAVE_LIBLIGHTGREP -// // #include "beregex.h" -// #include "be20_api/histogram_def.h" #include "pattern_scanner.h" #include "scanner_set.h" #include -// #include -// #include -// #include -// #include - -// #include - -// #ifdef LGBENCHMARK -// #include -// #endif +#ifdef LGBENCHMARK +#include +#endif namespace { const char* DefaultEncodingsCStrings[] = {"UTF-8", "UTF-16LE"}; const unsigned int NumDefaultEncodings = 2; } - -// bool PatternScanner::handleParseError(const Handler& h, LG_Error* err) const { -// cerr << "Parse error on '" << h.RE << "' in " << Name -// << ": " << err->Message << endl; -// return false; -// } - void PatternScanner::shutdown(const scanner_params&) { // for (vector::iterator itr(Handlers.begin()); itr != Handlers.end(); ++itr) { // delete *itr; @@ -55,55 +39,6 @@ LightgrepController::~LightgrepController() { lg_destroy_program(Prog); } -// LightgrepController& LightgrepController::Get() { -// // Meyers Singleton. c.f. Effective C++ by Scott Meyers -// static LightgrepController controller; -// return controller; -// } - -// bool LightgrepController::addScanner(PatternScanner& scanner) { -// // Add patterns and handlers from a Scanner to the centralized automaton -// LG_Error* lgErr = 0; - -// unsigned int patBegin = numeric_limits::max(), -// patEnd = 0; - -// int idx = -1; - -// // iterate all the scanner's handlers -// for (vector::const_iterator h(scanner.handlers().begin()); h != scanner.handlers().end(); ++h) { -// bool good = false; -// if (lg_parse_pattern(ParsedPattern, (*h)->RE.c_str(), &(*h)->Options, &lgErr)) { // parse the pattern -// for (vector::const_iterator enc((*h)->Encodings.begin()); enc != (*h)->Encodings.end(); ++enc) { -// idx = lg_add_pattern(Fsm, PatternInfo, ParsedPattern, enc->c_str(), &lgErr); // add the pattern for each given encoding -// if (idx >= 0) { -// // add the handler callback to the pattern map, associated with the pattern index -// lg_pattern_info(PatternInfo, idx)->UserData = const_cast(static_cast(&((*h)->Callback))); -// patBegin = std::min(patBegin, static_cast(idx)); -// good = true; -// } -// } - -// // std::cerr << '\t' << (int)((*h)->Options.FixedString) << '\t' << (int)((*h)->Options.CaseInsensitive) << std::endl; -// } -// if (!good) { -// if (scanner.handleParseError(**h, lgErr)) { -// lg_free_error(lgErr); -// lgErr = 0; -// } -// else { -// return false; -// } -// } -// } -// patEnd = lg_pattern_map_size(PatternInfo); -// // record the range of this scanner's patterns in the central pattern map -// scanner.patternRange() = make_pair(patBegin, patEnd); -// Scanners.push_back(&scanner); -// return true; -// } - -/* note: findopts is now part of scanner_set.scanner_config, you need to pass that in here. */ bool LightgrepController::addUserPatterns( PatternScanner& scanner, const vector& cli_patterns, @@ -248,12 +183,6 @@ void LightgrepController::scan(const scanner_params& sp) { lg_destroy_context(ctx); } -// void LightgrepController::processHit(const vector& sTbl, const LG_SearchHit& hit, const scanner_params& sp, const recursion_control_block& rcb) { -// // lookup the handler's callback functor in the pattern map, then invoke it -// CallbackFnType* cbPtr(static_cast(lg_pattern_info(PatternInfo, hit.KeywordIndex)->UserData)); -// ((*sTbl[hit.KeywordIndex]).*(*cbPtr))(hit, sp, rcb); // ...yep... -// } - unsigned int LightgrepController::numPatterns() const { return Prog ? lg_prog_pattern_count(Prog) : 0; //lg_pattern_map_size(PatternInfo); } From 707466bafd6ead60b7994793fc0f37a362d83a2b Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Thu, 11 May 2023 14:42:05 -0400 Subject: [PATCH 18/31] b - lightgrep should run even if find is disabled --- src/scan_lightgrep.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scan_lightgrep.cpp b/src/scan_lightgrep.cpp index ff41c90a..2cb58505 100644 --- a/src/scan_lightgrep.cpp +++ b/src/scan_lightgrep.cpp @@ -35,7 +35,7 @@ namespace { // local namespace hides these from other translation units sp.info->description = "Advanced search for patterns"; sp.info->scanner_version = "1.0"; sp.info->feature_defs.push_back( feature_recorder_def("lightgrep")); - // sp.info->flags = scanner_info::SCANNER_FIND_SCANNER | scanner_info::SCANNER_FAST_FIND; + sp.info->scanner_flags.find_scanner = true; // sp.info->feature_names.insert(name()); // sp.info->histogram_defs.insert(histogram_def( name(), "", "histogram", HistogramMaker::FLAG_LOWERCASE)); } From 48b9ca383bfffe86767ea06adb671448cc8fd038 Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Thu, 11 May 2023 14:59:11 -0400 Subject: [PATCH 19/31] F - enable histogram for lightgrep --- src/scan_lightgrep.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/scan_lightgrep.cpp b/src/scan_lightgrep.cpp index 2cb58505..2efe7457 100644 --- a/src/scan_lightgrep.cpp +++ b/src/scan_lightgrep.cpp @@ -36,8 +36,9 @@ namespace { // local namespace hides these from other translation units sp.info->scanner_version = "1.0"; sp.info->feature_defs.push_back( feature_recorder_def("lightgrep")); sp.info->scanner_flags.find_scanner = true; - // sp.info->feature_names.insert(name()); - // sp.info->histogram_defs.insert(histogram_def( name(), "", "histogram", HistogramMaker::FLAG_LOWERCASE)); + auto lowercase = histogram_def::flags_t(); + lowercase.lowercase = true; + sp.info->histogram_defs.insert(histogram_def(name(), name(), "", "", "histogram", lowercase)); } virtual void init(const scanner_params& sp) { From e7ac804fe087a8a58b099d9e26b4874802d6ed54 Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Thu, 11 May 2023 15:09:13 -0400 Subject: [PATCH 20/31] get rid of comments & unused code --- src/pattern_scanner.cpp | 44 ++---------------------- src/pattern_scanner.h | 76 ++--------------------------------------- src/scan_lightgrep.cpp | 20 ----------- 3 files changed, 6 insertions(+), 134 deletions(-) diff --git a/src/pattern_scanner.cpp b/src/pattern_scanner.cpp index 0c47a483..11d8ef65 100644 --- a/src/pattern_scanner.cpp +++ b/src/pattern_scanner.cpp @@ -18,16 +18,11 @@ namespace { } void PatternScanner::shutdown(const scanner_params&) { - // for (vector::iterator itr(Handlers.begin()); itr != Handlers.end(); ++itr) { - // delete *itr; - // } } -// /*********************************************************/ LightgrepController::LightgrepController() : ParsedPattern(lg_create_pattern()), // Reuse the parsed pattern data structure for efficiency Fsm(lg_create_fsm(1000, 1 << 20)), // Reserve space for 1M states in the automaton--will grow if needed - // PatternInfo(lg_create_pattern_map(1000)), // Reserve space for 1000 patterns in the pattern map Prog(0), Scanners() { @@ -35,17 +30,15 @@ LightgrepController::LightgrepController() LightgrepController::~LightgrepController() { lg_destroy_pattern(ParsedPattern); - // lg_destroy_pattern_map(PatternInfo); lg_destroy_program(Prog); } bool LightgrepController::addUserPatterns( PatternScanner& scanner, const vector& cli_patterns, - const vector& user_files) { // CallbackFnType* callbackPtr, const FindOpts& user) { + const vector& user_files) { LG_Error *err = 0; - LG_KeyOptions opts; opts.FixedString = 0; opts.CaseInsensitive = 0; @@ -112,7 +105,6 @@ void LightgrepController::regcomp() { lg_destroy_fsm(Fsm); Fsm = 0; - // cerr << lg_pattern_map_size(PatternInfo) << " lightgrep patterns, logic size is " << lg_program_size(Prog) << " bytes, " << Scanners.size() << " active scanners" << std::endl; #ifdef LGBENCHMARK cerr << "timer second ratio " << chrono::high_resolution_clock::period::num << "/" << chrono::high_resolution_clock::period::den << endl; @@ -129,14 +121,13 @@ void gotHit(void* userData, const LG_SearchHit* hit) { // no callback, just increment hit counter ++(*static_cast(userData)); #else - // trampoline back into LightgrepController::processHit() from the void* userData HitData* data(reinterpret_cast(userData)); data->recorder.write_buf(data->sbuf, hit->Start, hit->End - hit->Start); #endif } void LightgrepController::scan(const scanner_params& sp) { - // Scan the sbuf for pattern hits, invoking various scanners' handlers as hits are encountered + // Scan the sbuf for pattern hits if (!Prog) { // we had no valid patterns, do nothing return; @@ -177,42 +168,13 @@ void LightgrepController::scan(const scanner_params& sp) { std::stringstream buf; buf << " ** Time: " << sbuf.pos0.str() << '\t' << sbuf.pagesize << '\t' << t.count() << '\t' << seconds<< '\t' << hitCount << '\t' << bw << std::endl; std::cout << buf.str(); -// std::cout.flush(); #endif lg_destroy_context(ctx); } unsigned int LightgrepController::numPatterns() const { - return Prog ? lg_prog_pattern_count(Prog) : 0; //lg_pattern_map_size(PatternInfo); + return Prog ? lg_prog_pattern_count(Prog) : 0; } -// /*********************************************************/ - -// void scan_lg(PatternScanner& scanner, class scanner_params &sp) { -// // utility implementation of the normal scan function for a PatternScanner instance -// switch (sp.phase) { -// case scanner_params::PHASE_STARTUP: -// scanner.startup(sp); -// break; -// case scanner_params::PHASE_INIT: -// scanner.init(sp); -// if (!LightgrepController::Get().addScanner(scanner)) { -// // It's fine for user patterns not to parse, but there's no excuse for a scanner so exit. -// cerr << "Aborting. Fix pattern or disable scanner to continue." << endl; -// exit(EXIT_FAILURE); -// } -// break; -// case scanner_params::PHASE_SHUTDOWN: -// scanner.shutdown(sp); -// break; -// case scanner_params::PHASE_CLEANUP: -// TODO - to something here. -// default: -// break; -// } -// } - -// /*********************************************************/ - #endif // HAVE_LIBLIGHTGREP diff --git a/src/pattern_scanner.h b/src/pattern_scanner.h index 15a874c5..4899b5d8 100644 --- a/src/pattern_scanner.h +++ b/src/pattern_scanner.h @@ -16,26 +16,11 @@ using namespace std; class PatternScanner; -// /** -// * the function prototype for a handler callback -// * LG_SearchHit - LightGrep Search Hit. -// * scanner_params - the parameters available to the scanner. -// * recursion_control_clock - information about where we are in the recursive analysis. -// */ - -// typedef void (PatternScanner::*CallbackFnType)(const LG_SearchHit&, -// const scanner_params& sp, -// const recursion_control_block& rcb); - -// /*********************************************************/ - -// struct Handler; - // // Inherit from this to create your own Lightgrep-based scanners // // clone(), startup(), init(), and initScan() must be overridden class PatternScanner { public: - PatternScanner(const string& n): Name(n) {} //Handlers(), PatternRange(0, 0) {} + PatternScanner(const string& n): Name(n) {} virtual ~PatternScanner() {} virtual PatternScanner* clone() const = 0; @@ -51,59 +36,13 @@ class PatternScanner { virtual void shutdown(const scanner_params& sp); // perform any shutdown, if necessary - // return bool indicates whether scanner addition should be continued - // default is to print message to stderr and quit parsing scanner patterns - // virtual bool handleParseError(const Handler& h, LG_Error* err) const; - - // virtual void addHandler(const Handler* h) { - // Handlers.push_back(h); - // } - - // virtual const vector& handlers() const { return Handlers; } - - // pair& patternRange() { return PatternRange; } - // const pair& patternRange() const { return PatternRange; } - protected: PatternScanner(const PatternScanner& s): - Name(s.Name) {} //, Handlers(s.Handlers), PatternRange(s.PatternRange) {} + Name(s.Name) {} string Name; - // vector Handlers; - - // pair PatternRange; // knows the label range of its associated patterns }; -// /*********************************************************/ - -// struct Handler { -// // Agglomeration of the scanner, pattern, encodings, parse options, and callback -// template -// Handler( -// PatternScanner& scanner, -// const string& re, -// const vector& encs, -// const LG_KeyOptions& opts, -// Fn fn -// ): -// RE(re), -// Encodings(encs), -// Options(opts), -// Callback(static_cast(fn)) -// { -// scanner.addHandler(this); -// } - -// string RE; - -// vector Encodings; - -// LG_KeyOptions Options; - -// CallbackFnType Callback; -// }; - -// /*********************************************************/ class LightgrepController { // Centralized search facility amongst PatternScanners public: @@ -111,32 +50,23 @@ class LightgrepController { // Centralized search facility amongst PatternScanne LightgrepController(); LightgrepController(const LightgrepController&); ~LightgrepController(); - // static LightgrepController& Get(); // singleton instance -// bool addScanner(PatternScanner& scanner); bool addUserPatterns(PatternScanner& scanner, const vector& cli_patterns, const vector& user_files); void regcomp(); void scan(const scanner_params& sp); -// void processHit(const vector& sTbl, const LG_SearchHit& hit, const scanner_params& sp, const recursion_control_block& rcb); unsigned int numPatterns() const; private: -// LightgrepController& operator=(const LightgrepController&); + LightgrepController& operator=(const LightgrepController&); LG_HPATTERN ParsedPattern; LG_HFSM Fsm; - // LG_HPATTERNMAP PatternInfo; LG_HPROGRAM Prog; vector Scanners; }; -// /*********************************************************/ - -// // Utility function. Makes your scan function a one-liner, given a PatternScanner instance -// void scan_lg(PatternScanner& scanner, struct scanner_params &sp; - #endif #endif /* PATTERN_SCANNER_H */ diff --git a/src/scan_lightgrep.cpp b/src/scan_lightgrep.cpp index 2efe7457..ddb5aede 100644 --- a/src/scan_lightgrep.cpp +++ b/src/scan_lightgrep.cpp @@ -44,16 +44,8 @@ namespace { // local namespace hides these from other translation units virtual void init(const scanner_params& sp) { } - virtual void initScan(const scanner_params& sp) { - // LgRec = &sp.named_feature_recorder(name()); - } - feature_recorder* LgRec; - void processHit(const LG_SearchHit& hit, const scanner_params& sp) { - // LgRec->write_buf(sp.sbuf, hit.Start, hit.End - hit.Start); - } - private: FindScanner(const FindScanner& x): PatternScanner(x), LgRec(x.LgRec) {} @@ -61,8 +53,6 @@ namespace { // local namespace hides these from other translation units }; FindScanner Scanner; - - // CallbackFnType ProcessHit; } extern "C" @@ -71,7 +61,6 @@ void scan_lightgrep(struct scanner_params &sp) { switch (sp.phase) { case scanner_params::PHASE_INIT: Scanner.startup(sp); - // ProcessHit = static_cast(&FindScanner::processHit); break; case scanner_params::PHASE_INIT2: { @@ -82,21 +71,12 @@ void scan_lightgrep(struct scanner_params &sp) { // break; } break; - // PHASE_ENABLED is never current phase when this func is called - // case scanner_params::PHASE_ENABLED: - // break; case scanner_params::PHASE_SCAN: lg_ptr->scan(sp); break; case scanner_params::PHASE_SHUTDOWN: // Scanner.shutdown(sp); break; - // no cleanup needs to happen because lightgrep controller handles dealloc - // case scanner_params::PHASE_CLEANUP: - // break; - // PHASE_CLEANED is never current phase when this func is called, used for internal bookkeeping - // case scanner_params::PHASE_CLEANED: - // break; default: break; } From 8cdc2f16a2f37117cbc9d9b0db16598790451dfc Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Thu, 11 May 2023 15:26:18 -0400 Subject: [PATCH 21/31] F - delete Fsm if not deleted by regcomp --- src/pattern_scanner.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/pattern_scanner.cpp b/src/pattern_scanner.cpp index 11d8ef65..b1abd395 100644 --- a/src/pattern_scanner.cpp +++ b/src/pattern_scanner.cpp @@ -31,6 +31,10 @@ LightgrepController::LightgrepController() LightgrepController::~LightgrepController() { lg_destroy_pattern(ParsedPattern); lg_destroy_program(Prog); + if (Fsm) { + lg_destroy_fsm(Fsm); + Fsm = 0; + } } bool LightgrepController::addUserPatterns( From bb88ad891a37de0265377ec3b6174a215e3bf12a Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Thu, 11 May 2023 15:26:58 -0400 Subject: [PATCH 22/31] b - don't declare initScan --- src/pattern_scanner.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pattern_scanner.h b/src/pattern_scanner.h index 4899b5d8..d111fea2 100644 --- a/src/pattern_scanner.h +++ b/src/pattern_scanner.h @@ -31,7 +31,6 @@ class PatternScanner { virtual void init(const scanner_params& sp) = 0; // register handlers - virtual void initScan(const scanner_params& sp) = 0; // get feature_recorders virtual void finishScan(const scanner_params& sp) {} // done searching a region virtual void shutdown(const scanner_params& sp); // perform any shutdown, if necessary From 90bb1082e63bc604268ab46c096adc0ccfe00df5 Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Thu, 11 May 2023 15:27:18 -0400 Subject: [PATCH 23/31] F - remove Scanner "global" variable --- src/scan_lightgrep.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/scan_lightgrep.cpp b/src/scan_lightgrep.cpp index ddb5aede..aad2a78e 100644 --- a/src/scan_lightgrep.cpp +++ b/src/scan_lightgrep.cpp @@ -38,7 +38,7 @@ namespace { // local namespace hides these from other translation units sp.info->scanner_flags.find_scanner = true; auto lowercase = histogram_def::flags_t(); lowercase.lowercase = true; - sp.info->histogram_defs.insert(histogram_def(name(), name(), "", "", "histogram", lowercase)); + sp.info->histogram_defs.push_back(histogram_def(name(), name(), "", "", "histogram", lowercase)); } virtual void init(const scanner_params& sp) { @@ -51,31 +51,31 @@ namespace { // local namespace hides these from other translation units FindScanner& operator=(const FindScanner&); }; - - FindScanner Scanner; } extern "C" void scan_lightgrep(struct scanner_params &sp) { + static std::unique_ptr lg_findscanner_ptr; static std::unique_ptr lg_ptr; switch (sp.phase) { case scanner_params::PHASE_INIT: - Scanner.startup(sp); + lg_findscanner_ptr.reset(new FindScanner); + lg_findscanner_ptr->startup(sp); break; case scanner_params::PHASE_INIT2: { - Scanner.init(sp); + lg_findscanner_ptr->init(sp); lg_ptr.reset(new LightgrepController); - lg_ptr->addUserPatterns(Scanner, sp.ss->find_patterns(), sp.ss->find_files()); + lg_ptr->addUserPatterns(*lg_findscanner_ptr, sp.ss->find_patterns(), sp.ss->find_files()); lg_ptr->regcomp(); - // break; } break; case scanner_params::PHASE_SCAN: lg_ptr->scan(sp); break; case scanner_params::PHASE_SHUTDOWN: - // Scanner.shutdown(sp); + lg_findscanner_ptr.reset(); + lg_ptr.reset(); break; default: break; From 8c143b3ed89d23cc9148ef3d3f851244e2f74b2d Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Thu, 11 May 2023 15:28:40 -0400 Subject: [PATCH 24/31] remove unused includes --- src/scan_lightgrep.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/scan_lightgrep.cpp b/src/scan_lightgrep.cpp index aad2a78e..e2ba68c1 100644 --- a/src/scan_lightgrep.cpp +++ b/src/scan_lightgrep.cpp @@ -10,8 +10,6 @@ #include "be20_api/scanner_params.h" #include "be20_api/scanner_set.h" - -//#include "be20_api/beregex.h" #include "be20_api/histogram_def.h" #include "pattern_scanner.h" From f553e621c823770bd4f794b8dc356374909b5985 Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Thu, 11 May 2023 15:42:53 -0400 Subject: [PATCH 25/31] F - throw exception when lightgrep fails to parse pattern --- src/scan_lightgrep.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/scan_lightgrep.cpp b/src/scan_lightgrep.cpp index e2ba68c1..57d540c3 100644 --- a/src/scan_lightgrep.cpp +++ b/src/scan_lightgrep.cpp @@ -64,7 +64,9 @@ void scan_lightgrep(struct scanner_params &sp) { { lg_findscanner_ptr->init(sp); lg_ptr.reset(new LightgrepController); - lg_ptr->addUserPatterns(*lg_findscanner_ptr, sp.ss->find_patterns(), sp.ss->find_files()); + if (!lg_ptr->addUserPatterns(*lg_findscanner_ptr, sp.ss->find_patterns(), sp.ss->find_files())) { + throw std::runtime_error("There was an error parsing the lightgrep scanner's patterns."); + } lg_ptr->regcomp(); } break; From 799221b697f7b5d26e5af9f6f86177f4d9d08b38 Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Thu, 11 May 2023 15:43:19 -0400 Subject: [PATCH 26/31] better error messages --- src/pattern_scanner.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pattern_scanner.cpp b/src/pattern_scanner.cpp index b1abd395..8fdcea9b 100644 --- a/src/pattern_scanner.cpp +++ b/src/pattern_scanner.cpp @@ -62,7 +62,7 @@ bool LightgrepController::addUserPatterns( good = false; } if (!good) { - cerr << "Error on '" << itr.c_str() << "': " << err->Message << endl; + cerr << "Lightgrep error parsing '" << itr.c_str() << "': " << err->Message << endl; lg_free_error(err); return false; } @@ -72,7 +72,7 @@ bool LightgrepController::addUserPatterns( for (const auto& itr : user_files) { ifstream file(itr.c_str(), ios::in); if (!file.is_open()) { - cerr << "Could not open pattern file '" << itr.c_str() << "'." << endl; + cerr << "Lightgrep scanner could not open pattern file '" << itr.c_str() << "'." << endl; return false; } string contents = string(istreambuf_iterator(file), istreambuf_iterator()); @@ -89,7 +89,7 @@ bool LightgrepController::addUserPatterns( } LG_Error* cur(err); while (cur) { - cerr << "Error in " << itr.c_str() << ", line " << cur->Index+1 << ", pattern '" << lines[cur->Index] + cerr << "Lightgrep parsing error in " << itr.c_str() << ", on line " << cur->Index+1 << ", on pattern '" << lines[cur->Index] << "': " << cur->Message << endl; cur = cur->Next; } From 0ca43ec499e81db3bb9f39cfbf7932ac9698662c Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Thu, 11 May 2023 21:09:58 -0400 Subject: [PATCH 27/31] F - avoid unnecessary repeated heap alloc/dealloc --- src/pattern_scanner.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/pattern_scanner.cpp b/src/pattern_scanner.cpp index 8fdcea9b..ee1f76d7 100644 --- a/src/pattern_scanner.cpp +++ b/src/pattern_scanner.cpp @@ -20,6 +20,14 @@ namespace { void PatternScanner::shutdown(const scanner_params&) { } +struct LgContextHolder { + LG_HCONTEXT Ctx; + + LgContextHolder(LG_HPROGRAM Prog, LG_ContextOptions* ctxOpts) {Ctx = lg_create_context(Prog, ctxOpts);} + ~LgContextHolder() {lg_destroy_context(Ctx);} + +}; + LightgrepController::LightgrepController() : ParsedPattern(lg_create_pattern()), // Reuse the parsed pattern data structure for efficiency Fsm(lg_create_fsm(1000, 1 << 20)), // Reserve space for 1M states in the automaton--will grow if needed @@ -140,8 +148,9 @@ void LightgrepController::scan(const scanner_params& sp) { LG_ContextOptions ctxOpts; ctxOpts.TraceBegin = 0xffffffffffffffff; ctxOpts.TraceEnd = 0; - - LG_HCONTEXT ctx = lg_create_context(Prog, &ctxOpts); // create a search context; cannot be shared, so local to scan + thread_local LgContextHolder ctx(Prog, &ctxOpts); + lg_reset_context(ctx.Ctx); + //LG_HCONTEXT ctx = lg_create_context(Prog, &ctxOpts); // create a search context; cannot be shared, so local to scan const sbuf_t &sbuf = *sp.sbuf; HitData callbackInfo = { sp.named_feature_recorder("lightgrep"), *sp.sbuf }; @@ -157,12 +166,12 @@ void LightgrepController::scan(const scanner_params& sp) { // search the sbuf in one go // the gotHit() function will be invoked for each pattern hit - if (lg_search(ctx, (const char*)sbuf.get_buf(), (const char*)sbuf.get_buf() + sbuf.pagesize, 0, userData, gotHit) < numeric_limits::max()) { + if (lg_search(ctx.Ctx, (const char*)sbuf.get_buf(), (const char*)sbuf.get_buf() + sbuf.pagesize, 0, userData, gotHit) < numeric_limits::max()) { // resolve potential hits that want data into the sbuf margin, without beginning any new hits - lg_search_resolve(ctx, (const char*)sbuf.get_buf() + sbuf.pagesize, (const char*)sbuf.get_buf() + sbuf.bufsize, sbuf.pagesize, userData, gotHit); + lg_search_resolve(ctx.Ctx, (const char*)sbuf.get_buf() + sbuf.pagesize, (const char*)sbuf.get_buf() + sbuf.bufsize, sbuf.pagesize, userData, gotHit); } // flush any remaining hits; there's no more data - lg_closeout_search(ctx, userData, gotHit); + lg_closeout_search(ctx.Ctx, userData, gotHit); #ifdef LGBENCHMARK auto endClock = std::chrono::high_resolution_clock::now(); @@ -174,7 +183,7 @@ void LightgrepController::scan(const scanner_params& sp) { std::cout << buf.str(); #endif - lg_destroy_context(ctx); + //lg_destroy_context(ctx); } unsigned int LightgrepController::numPatterns() const { From 007b9c9b4ce36c5c0ec02570a9bc2b84f6c207d7 Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Tue, 30 May 2023 14:32:28 -0400 Subject: [PATCH 28/31] F!! - Reverting use of thread local because it seems to cause a performance regression. The timings below are from the following command: ./src/bulk_extractor -F ../lightgrep/pytest/keys/shuf10.txt -Z -o ~/be_timed_output_without_thread_local_`printf %04d $i` -E scan_lightgrep ~/ev/terry-2009-12-11-002.E01 Thread_local? Clocktime (Min.) Clocktime (Max.) Clocktime (Average) Scan Lightgrep Time (Min.) Scan Lightgrep Time (Max.) Scan Lightgrep Time (Average) FALSE 162.965479 168.628229 164.2545712 494.810946 528.368114 504.1799554 TRUE 163.681386 173.587754 167.233617 499.815450 532.324335 516.4901762 This reverts commit 0ca43ec499e81db3bb9f39cfbf7932ac9698662c. --- src/pattern_scanner.cpp | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/src/pattern_scanner.cpp b/src/pattern_scanner.cpp index ee1f76d7..8fdcea9b 100644 --- a/src/pattern_scanner.cpp +++ b/src/pattern_scanner.cpp @@ -20,14 +20,6 @@ namespace { void PatternScanner::shutdown(const scanner_params&) { } -struct LgContextHolder { - LG_HCONTEXT Ctx; - - LgContextHolder(LG_HPROGRAM Prog, LG_ContextOptions* ctxOpts) {Ctx = lg_create_context(Prog, ctxOpts);} - ~LgContextHolder() {lg_destroy_context(Ctx);} - -}; - LightgrepController::LightgrepController() : ParsedPattern(lg_create_pattern()), // Reuse the parsed pattern data structure for efficiency Fsm(lg_create_fsm(1000, 1 << 20)), // Reserve space for 1M states in the automaton--will grow if needed @@ -148,9 +140,8 @@ void LightgrepController::scan(const scanner_params& sp) { LG_ContextOptions ctxOpts; ctxOpts.TraceBegin = 0xffffffffffffffff; ctxOpts.TraceEnd = 0; - thread_local LgContextHolder ctx(Prog, &ctxOpts); - lg_reset_context(ctx.Ctx); - //LG_HCONTEXT ctx = lg_create_context(Prog, &ctxOpts); // create a search context; cannot be shared, so local to scan + + LG_HCONTEXT ctx = lg_create_context(Prog, &ctxOpts); // create a search context; cannot be shared, so local to scan const sbuf_t &sbuf = *sp.sbuf; HitData callbackInfo = { sp.named_feature_recorder("lightgrep"), *sp.sbuf }; @@ -166,12 +157,12 @@ void LightgrepController::scan(const scanner_params& sp) { // search the sbuf in one go // the gotHit() function will be invoked for each pattern hit - if (lg_search(ctx.Ctx, (const char*)sbuf.get_buf(), (const char*)sbuf.get_buf() + sbuf.pagesize, 0, userData, gotHit) < numeric_limits::max()) { + if (lg_search(ctx, (const char*)sbuf.get_buf(), (const char*)sbuf.get_buf() + sbuf.pagesize, 0, userData, gotHit) < numeric_limits::max()) { // resolve potential hits that want data into the sbuf margin, without beginning any new hits - lg_search_resolve(ctx.Ctx, (const char*)sbuf.get_buf() + sbuf.pagesize, (const char*)sbuf.get_buf() + sbuf.bufsize, sbuf.pagesize, userData, gotHit); + lg_search_resolve(ctx, (const char*)sbuf.get_buf() + sbuf.pagesize, (const char*)sbuf.get_buf() + sbuf.bufsize, sbuf.pagesize, userData, gotHit); } // flush any remaining hits; there's no more data - lg_closeout_search(ctx.Ctx, userData, gotHit); + lg_closeout_search(ctx, userData, gotHit); #ifdef LGBENCHMARK auto endClock = std::chrono::high_resolution_clock::now(); @@ -183,7 +174,7 @@ void LightgrepController::scan(const scanner_params& sp) { std::cout << buf.str(); #endif - //lg_destroy_context(ctx); + lg_destroy_context(ctx); } unsigned int LightgrepController::numPatterns() const { From fb0e42e03cd8471fe5b42d0bd005b9f3e49e9631 Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Tue, 30 May 2023 14:40:44 -0400 Subject: [PATCH 29/31] F - update scanner name and version --- src/scan_lightgrep.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scan_lightgrep.cpp b/src/scan_lightgrep.cpp index 57d540c3..a39f3f9d 100644 --- a/src/scan_lightgrep.cpp +++ b/src/scan_lightgrep.cpp @@ -28,10 +28,10 @@ namespace { // local namespace hides these from other translation units }; virtual void startup(const scanner_params& sp) { - sp.info->set_name("scan_lightgrep"); + sp.info->set_name("lightgrep"); sp.info->author = "Jon Stewart"; sp.info->description = "Advanced search for patterns"; - sp.info->scanner_version = "1.0"; + sp.info->scanner_version = "2.0"; sp.info->feature_defs.push_back( feature_recorder_def("lightgrep")); sp.info->scanner_flags.find_scanner = true; auto lowercase = histogram_def::flags_t(); From 5e55041d47280a29376a155716782a53309fb8fc Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Tue, 30 May 2023 14:41:36 -0400 Subject: [PATCH 30/31] a - Delete superfluous whitespace --- src/scan_lightgrep.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scan_lightgrep.cpp b/src/scan_lightgrep.cpp index a39f3f9d..38f3b065 100644 --- a/src/scan_lightgrep.cpp +++ b/src/scan_lightgrep.cpp @@ -32,7 +32,7 @@ namespace { // local namespace hides these from other translation units sp.info->author = "Jon Stewart"; sp.info->description = "Advanced search for patterns"; sp.info->scanner_version = "2.0"; - sp.info->feature_defs.push_back( feature_recorder_def("lightgrep")); + sp.info->feature_defs.push_back(feature_recorder_def("lightgrep")); sp.info->scanner_flags.find_scanner = true; auto lowercase = histogram_def::flags_t(); lowercase.lowercase = true; From 16e8eeb12265d779122b8c2bea0f0a9f5df9714c Mon Sep 17 00:00:00 2001 From: Julia Paluch Date: Tue, 30 May 2023 14:47:01 -0400 Subject: [PATCH 31/31] a - formatting, fix inaccurate comments --- src/pattern_scanner.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pattern_scanner.h b/src/pattern_scanner.h index d111fea2..a85535e6 100644 --- a/src/pattern_scanner.h +++ b/src/pattern_scanner.h @@ -17,7 +17,7 @@ using namespace std; class PatternScanner; // // Inherit from this to create your own Lightgrep-based scanners -// // clone(), startup(), init(), and initScan() must be overridden +// // clone(), startup(), and init() must be overridden class PatternScanner { public: PatternScanner(const string& n): Name(n) {} @@ -39,7 +39,7 @@ class PatternScanner { PatternScanner(const PatternScanner& s): Name(s.Name) {} - string Name; + string Name; };