diff --git a/src/Makefile.am b/src/Makefile.am index 62d85038..37634f5a 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -57,15 +57,11 @@ scan_accts.o: sbuf_flex_scanner.h scan_email.o: sbuf_flex_scanner.h scan_gps.o: sbuf_flex_scanner.h -# These scanners are based on Lightbox Technology's lightgrep +# These scanners are based on Stroz Friedberg's lightgrep lightgrep_scanners = \ pattern_scanner.cpp pattern_scanner.h \ pattern_scanner_utils.cpp pattern_scanner_utils.h \ - scan_lightgrep.cpp \ - scan_accts_lg.cpp \ - scan_base16_lg.cpp \ - scan_email_lg.cpp \ - scan_gps_lg.cpp + scan_lightgrep.cpp # scanners_builtin are the scanners that are compiled into the binary diff --git a/src/bulk_extractor_scanners.h b/src/bulk_extractor_scanners.h index 1ac62e1b..cb612273 100644 --- a/src/bulk_extractor_scanners.h +++ b/src/bulk_extractor_scanners.h @@ -64,9 +64,5 @@ SCANNER(zip) #ifdef HAVE_LIBLIGHTGREP -//SCANNER(accts_lg) -//SCANNER(base16_lg) -//SCANNER(email_lg) -//SCANNER(gps_lg) -//SCANNER(lightgrep) +SCANNER(lightgrep) #endif diff --git a/src/pattern_scanner.cpp b/src/pattern_scanner.cpp index 0f188128..8fdcea9b 100644 --- a/src/pattern_scanner.cpp +++ b/src/pattern_scanner.cpp @@ -3,19 +3,11 @@ // if liblightgrep isn't present, compiles to nothing #ifdef HAVE_LIBLIGHTGREP -#include "beregex.h" -#include "histogram.h" #include "pattern_scanner.h" +#include "scanner_set.h" #include -#include -#include -#include -#include - -#include - #ifdef LGBENCHMARK #include #endif @@ -25,23 +17,12 @@ namespace { const unsigned int NumDefaultEncodings = 2; } -bool PatternScanner::handleParseError(const Handler& h, LG_Error* err) const { - cerr << "Parse error on '" << h.RE << "' in " << Name - << ": " << err->Message << endl; - return false; -} - void PatternScanner::shutdown(const scanner_params&) { - for (vector::iterator itr(Handlers.begin()); itr != Handlers.end(); ++itr) { - delete *itr; - } } -/*********************************************************/ LightgrepController::LightgrepController() : ParsedPattern(lg_create_pattern()), // Reuse the parsed pattern data structure for efficiency - Fsm(lg_create_fsm(1 << 20)), // Reserve space for 1M states in the automaton--will grow if needed - PatternInfo(lg_create_pattern_map(1000)), // Reserve space for 1000 patterns in the pattern map + Fsm(lg_create_fsm(1000, 1 << 20)), // Reserve space for 1M states in the automaton--will grow if needed Prog(0), Scanners() { @@ -49,83 +30,56 @@ LightgrepController::LightgrepController() LightgrepController::~LightgrepController() { lg_destroy_pattern(ParsedPattern); - lg_destroy_pattern_map(PatternInfo); lg_destroy_program(Prog); + if (Fsm) { + lg_destroy_fsm(Fsm); + Fsm = 0; + } } -LightgrepController& LightgrepController::Get() { - // Meyers Singleton. c.f. Effective C++ by Scott Meyers - static LightgrepController controller; - return controller; -} - -bool LightgrepController::addScanner(PatternScanner& scanner) { - // Add patterns and handlers from a Scanner to the centralized automaton - LG_Error* lgErr = 0; +bool LightgrepController::addUserPatterns( + PatternScanner& scanner, + const vector& cli_patterns, + const vector& user_files) { - unsigned int patBegin = numeric_limits::max(), - patEnd = 0; + LG_Error *err = 0; + LG_KeyOptions opts; + opts.FixedString = 0; + opts.CaseInsensitive = 0; - int idx = -1; + bool good = true; - // iterate all the scanner's handlers - for (vector::const_iterator h(scanner.handlers().begin()); h != scanner.handlers().end(); ++h) { - bool good = false; - if (lg_parse_pattern(ParsedPattern, (*h)->RE.c_str(), &(*h)->Options, &lgErr)) { // parse the pattern - for (vector::const_iterator enc((*h)->Encodings.begin()); enc != (*h)->Encodings.end(); ++enc) { - idx = lg_add_pattern(Fsm, PatternInfo, ParsedPattern, enc->c_str(), &lgErr); // add the pattern for each given encoding - if (idx >= 0) { - // add the handler callback to the pattern map, associated with the pattern index - lg_pattern_info(PatternInfo, idx)->UserData = const_cast(static_cast(&((*h)->Callback))); - patBegin = std::min(patBegin, static_cast(idx)); - good = true; + // add patterns from single command-line arguments + for (const auto& itr : cli_patterns) { + if (lg_parse_pattern(ParsedPattern, itr.c_str(), &opts, &err)) { + for (unsigned int i = 0; i < NumDefaultEncodings; ++i) { + if (lg_add_pattern(Fsm, ParsedPattern, DefaultEncodingsCStrings[i], 0, &err) < 0) { + good = false; + break; } } - -// std::cerr << '\t' << (int)((*h)->Options.FixedString) << '\t' << (int)((*h)->Options.CaseInsensitive) << std::endl; + } else { + good = false; } if (!good) { - if (scanner.handleParseError(**h, lgErr)) { - lg_free_error(lgErr); - lgErr = 0; - } - else { - return false; - } + cerr << "Lightgrep error parsing '" << itr.c_str() << "': " << err->Message << endl; + lg_free_error(err); + return false; } } - patEnd = lg_pattern_map_size(PatternInfo); - // record the range of this scanner's patterns in the central pattern map - scanner.patternRange() = make_pair(patBegin, patEnd); - Scanners.push_back(&scanner); - return true; -} - -/* note: findopts is now part of scanner_set.scanner_config, you need to pass that in here. */ -bool LightgrepController::addUserPatterns(PatternScanner& scanner, CallbackFnType* callbackPtr, const FindOpts& user) { - // Add patterns specified as keywords by the user - // Similar to above, but does not have a handler per pattern - unsigned int patBegin = lg_pattern_map_size(PatternInfo), - patEnd = 0; - - LG_KeyOptions opts; - opts.FixedString = 0; - opts.CaseInsensitive = 0; - - LG_Error *err = 0; // Add patterns from files - for (vector::const_iterator itr(user.Files.begin()); itr != user.Files.end(); ++itr) { - ifstream file(itr->c_str(), ios::in); + for (const auto& itr : user_files) { + ifstream file(itr.c_str(), ios::in); if (!file.is_open()) { - cerr << "Could not open pattern file '" << *itr << "'." << endl; + cerr << "Lightgrep scanner could not open pattern file '" << itr.c_str() << "'." << endl; return false; } string contents = string(istreambuf_iterator(file), istreambuf_iterator()); const char* contentsCStr = contents.c_str(); // Add all the patterns from the files in one fell swoop - if (lg_add_pattern_list(Fsm, PatternInfo, contentsCStr, itr->c_str(), DefaultEncodingsCStrings, 2, &opts, &err) < 0) { + if (lg_add_pattern_list(Fsm, contentsCStr, itr.c_str(), DefaultEncodingsCStrings, NumDefaultEncodings, &opts, &err) < 0) { vector lines; istringstream input(contents); string line; @@ -135,7 +89,7 @@ bool LightgrepController::addUserPatterns(PatternScanner& scanner, CallbackFnTyp } LG_Error* cur(err); while (cur) { - cerr << "Error in " << *itr << ", line " << cur->Index+1 << ", pattern '" << lines[cur->Index] + cerr << "Lightgrep parsing error in " << itr.c_str() << ", on line " << cur->Index+1 << ", on pattern '" << lines[cur->Index] << "': " << cur->Message << endl; cur = cur->Next; } @@ -143,39 +97,18 @@ bool LightgrepController::addUserPatterns(PatternScanner& scanner, CallbackFnTyp return false; } } - // add patterns from single command-line arguments - for (vector::const_iterator itr(user.Patterns.begin()); itr != user.Patterns.end(); ++itr) { - bool good = false; - if (lg_parse_pattern(ParsedPattern, itr->c_str(), &opts, &err)) { - for (unsigned int i = 0; i < NumDefaultEncodings; ++i) { - if (lg_add_pattern(Fsm, PatternInfo, ParsedPattern, DefaultEncodingsCStrings[i], &err) >= 0) { - good = true; - } - } - } - if (!good) { - cerr << "Error on '" << *itr << "': " << err->Message << endl; - lg_free_error(err); - return false; - } - } - patEnd = lg_pattern_map_size(PatternInfo); - for (unsigned int i = patBegin; i < patEnd; ++i) { - lg_pattern_info(PatternInfo, i)->UserData = const_cast(static_cast(callbackPtr)); - } - scanner.patternRange() = make_pair(patBegin, patEnd); - Scanners.push_back(&scanner); + return true; } void LightgrepController::regcomp() { LG_ProgramOptions progOpts; - progOpts.Determinize = 1; + progOpts.DeterminizeDepth = 10; // Create an optimized, immutable form of the accumulated automaton Prog = lg_create_program(Fsm, &progOpts); lg_destroy_fsm(Fsm); + Fsm = 0; - cerr << lg_pattern_map_size(PatternInfo) << " lightgrep patterns, logic size is " << lg_program_size(Prog) << " bytes, " << Scanners.size() << " active scanners" << std::endl; #ifdef LGBENCHMARK cerr << "timer second ratio " << chrono::high_resolution_clock::period::num << "/" << chrono::high_resolution_clock::period::den << endl; @@ -183,11 +116,8 @@ void LightgrepController::regcomp() { } struct HitData { - // Everything we need for processing a hit - LightgrepController* lgc; - const vector* scannerTable; - const scanner_params* sp; - //const recursion_control_block* rcb; + feature_recorder &recorder; + const sbuf_t &sbuf; }; void gotHit(void* userData, const LG_SearchHit* hit) { @@ -195,38 +125,26 @@ void gotHit(void* userData, const LG_SearchHit* hit) { // no callback, just increment hit counter ++(*static_cast(userData)); #else - // trampoline back into LightgrepController::processHit() from the void* userData - HitData* hd(static_cast(userData)); - hd->lgc->processHit(*hd->scannerTable, *hit, *hd->sp, *hd->rcb); + HitData* data(reinterpret_cast(userData)); + data->recorder.write_buf(data->sbuf, hit->Start, hit->End - hit->Start); #endif } -void LightgrepController::scan(const scanner_params& sp, const recursion_control_block &rcb) { - // Scan the sbuf for pattern hits, invoking various scanners' handlers as hits are encountered +void LightgrepController::scan(const scanner_params& sp) { + // Scan the sbuf for pattern hits if (!Prog) { // we had no valid patterns, do nothing return; } - // First, clone all the scanners so that there's no shared data between threads - vector scannerTable(lg_pattern_map_size(PatternInfo)); // [Keyword Index -> scanner], no ownership - vector scannerList; // ownership list - for (vector::const_iterator itr(Scanners.begin()); itr != Scanners.end(); ++itr) { - PatternScanner *s = (*itr)->clone(); - scannerList.push_back(s); - for (unsigned int i = s->patternRange().first; i < s->patternRange().second; ++i) { - scannerTable[i] = s; - } - s->initScan(sp); // let the scanner know we're about to scan an sbuf - } + LG_ContextOptions ctxOpts; ctxOpts.TraceBegin = 0xffffffffffffffff; ctxOpts.TraceEnd = 0; LG_HCONTEXT ctx = lg_create_context(Prog, &ctxOpts); // create a search context; cannot be shared, so local to scan - const sbuf_t &sbuf = sp.sbuf; - - HitData callbackInfo = { this, &scannerTable, &sp, &rcb }; + const sbuf_t &sbuf = *sp.sbuf; + HitData callbackInfo = { sp.named_feature_recorder("lightgrep"), *sp.sbuf }; void* userData = &callbackInfo; #ifdef LGBENCHMARK // perform timings of lightgrep search functions only -- no callbacks @@ -239,9 +157,9 @@ void LightgrepController::scan(const scanner_params& sp, const recursion_control // search the sbuf in one go // the gotHit() function will be invoked for each pattern hit - if (lg_search(ctx, (const char*)sbuf.buf, (const char*)sbuf.buf + sbuf.pagesize, 0, userData, gotHit) < numeric_limits::max()) { + if (lg_search(ctx, (const char*)sbuf.get_buf(), (const char*)sbuf.get_buf() + sbuf.pagesize, 0, userData, gotHit) < numeric_limits::max()) { // resolve potential hits that want data into the sbuf margin, without beginning any new hits - lg_search_resolve(ctx, (const char*)sbuf.buf + sbuf.pagesize, (const char*)sbuf.buf + sbuf.bufsize, sbuf.pagesize, userData, gotHit); + lg_search_resolve(ctx, (const char*)sbuf.get_buf() + sbuf.pagesize, (const char*)sbuf.get_buf() + sbuf.bufsize, sbuf.pagesize, userData, gotHit); } // flush any remaining hits; there's no more data lg_closeout_search(ctx, userData, gotHit); @@ -254,54 +172,13 @@ void LightgrepController::scan(const scanner_params& sp, const recursion_control std::stringstream buf; buf << " ** Time: " << sbuf.pos0.str() << '\t' << sbuf.pagesize << '\t' << t.count() << '\t' << seconds<< '\t' << hitCount << '\t' << bw << std::endl; std::cout << buf.str(); -// std::cout.flush(); #endif lg_destroy_context(ctx); - - // don't call PatternScanner::shutdown() on these! that only happens on prototypes - for (vector::const_iterator itr(scannerList.begin()); itr != scannerList.end(); ++itr) { - (*itr)->finishScan(sp); // let the scanner know we're done with the sbuf - delete *itr; - } -} - -void LightgrepController::processHit(const vector& sTbl, const LG_SearchHit& hit, const scanner_params& sp, const recursion_control_block& rcb) { - // lookup the handler's callback functor in the pattern map, then invoke it - CallbackFnType* cbPtr(static_cast(lg_pattern_info(PatternInfo, hit.KeywordIndex)->UserData)); - ((*sTbl[hit.KeywordIndex]).*(*cbPtr))(hit, sp, rcb); // ...yep... } unsigned int LightgrepController::numPatterns() const { - return lg_pattern_map_size(PatternInfo); + return Prog ? lg_prog_pattern_count(Prog) : 0; } -/*********************************************************/ - -void scan_lg(PatternScanner& scanner, class scanner_params &sp) { - // utility implementation of the normal scan function for a PatternScanner instance - switch (sp.phase) { - case scanner_params::PHASE_STARTUP: - scanner.startup(sp); - break; - case scanner_params::PHASE_INIT: - scanner.init(sp); - if (!LightgrepController::Get().addScanner(scanner)) { - // It's fine for user patterns not to parse, but there's no excuse for a scanner so exit. - cerr << "Aborting. Fix pattern or disable scanner to continue." << endl; - exit(EXIT_FAILURE); - } - break; - case scanner_params::PHASE_SHUTDOWN: - scanner.shutdown(sp); - break; - case scanner_params::PHASE_CLEANUP: - TODO - to something here. - default: - break; - } -} - -/*********************************************************/ - #endif // HAVE_LIBLIGHTGREP diff --git a/src/pattern_scanner.h b/src/pattern_scanner.h index 4a6d4268..a85535e6 100644 --- a/src/pattern_scanner.h +++ b/src/pattern_scanner.h @@ -10,32 +10,17 @@ #include -#include "be13/plugin.h" +#include "be20_api/scanner_params.h" using namespace std; class PatternScanner; -/** - * the function prototype for a handler callback - * LG_SearchHit - LightGrep Search Hit. - * scanner_params - the parameters available to the scanner. - * recursion_control_clock - information about where we are in the recursive analysis. - */ - -typedef void (PatternScanner::*CallbackFnType)(const LG_SearchHit&, - const scanner_params& sp, - const recursion_control_block& rcb); - -/*********************************************************/ - -struct Handler; - -// Inherit from this to create your own Lightgrep-based scanners -// clone(), startup(), init(), and initScan() must be overridden +// // Inherit from this to create your own Lightgrep-based scanners +// // clone(), startup(), and init() must be overridden class PatternScanner { public: - PatternScanner(const string& n): Name(n), Handlers(), PatternRange(0, 0) {} + PatternScanner(const string& n): Name(n) {} virtual ~PatternScanner() {} virtual PatternScanner* clone() const = 0; @@ -46,98 +31,41 @@ class PatternScanner { virtual void init(const scanner_params& sp) = 0; // register handlers - virtual void initScan(const scanner_params& sp) = 0; // get feature_recorders virtual void finishScan(const scanner_params& sp) {} // done searching a region virtual void shutdown(const scanner_params& sp); // perform any shutdown, if necessary - // return bool indicates whether scanner addition should be continued - // default is to print message to stderr and quit parsing scanner patterns - virtual bool handleParseError(const Handler& h, LG_Error* err) const; - - virtual void addHandler(const Handler* h) { - Handlers.push_back(h); - } - - virtual const vector& handlers() const { return Handlers; } - - pair& patternRange() { return PatternRange; } - const pair& patternRange() const { return PatternRange; } - protected: PatternScanner(const PatternScanner& s): - Name(s.Name), Handlers(s.Handlers), PatternRange(s.PatternRange) {} - - string Name; - vector Handlers; - - pair PatternRange; // knows the label range of its associated patterns -}; + Name(s.Name) {} -/*********************************************************/ - -struct Handler { - // Agglomeration of the scanner, pattern, encodings, parse options, and callback - template - Handler( - PatternScanner& scanner, - const string& re, - const vector& encs, - const LG_KeyOptions& opts, - Fn fn - ): - RE(re), - Encodings(encs), - Options(opts), - Callback(static_cast(fn)) - { - scanner.addHandler(this); - } - - string RE; - - vector Encodings; - - LG_KeyOptions Options; - - CallbackFnType Callback; + string Name; }; -/*********************************************************/ class LightgrepController { // Centralized search facility amongst PatternScanners public: - static LightgrepController& Get(); // singleton instance + LightgrepController(); + LightgrepController(const LightgrepController&); + ~LightgrepController(); - bool addScanner(PatternScanner& scanner); - bool addUserPatterns(PatternScanner& scanner, CallbackFnType* callbackPtr, const FindOpts& userPatterns); + bool addUserPatterns(PatternScanner& scanner, const vector& cli_patterns, const vector& user_files); void regcomp(); - void scan(const scanner_params& sp, const recursion_control_block& rcb); - void processHit(const vector& sTbl, const LG_SearchHit& hit, const scanner_params& sp, const recursion_control_block& rcb); + void scan(const scanner_params& sp); unsigned int numPatterns() const; private: - LightgrepController(); - LightgrepController(const LightgrepController&); - ~LightgrepController(); - - LightgrepController& operator=(const LightgrepController&); + LightgrepController& operator=(const LightgrepController&); LG_HPATTERN ParsedPattern; LG_HFSM Fsm; - LG_HPATTERNMAP PatternInfo; LG_HPROGRAM Prog; vector Scanners; }; -/*********************************************************/ - -// Utility function. Makes your scan function a one-liner, given a PatternScanner instance -void scan_lg(PatternScanner& scanner, struct scanner_params &sp; - #endif #endif /* PATTERN_SCANNER_H */ diff --git a/src/scan_accts_lg.cpp b/src/scan_accts_lg.cpp deleted file mode 100644 index be241eaf..00000000 --- a/src/scan_accts_lg.cpp +++ /dev/null @@ -1,749 +0,0 @@ -#include "config.h" - -// if liblightgrep isn't present, compiles to nothing -#ifdef HAVE_LIBLIGHTGREP - -#include -#include - -#include "be20_api/scanner_params.h" -#include "histogram.h" -#include "scan_ccns2.h" -#include "pattern_scanner.h" -#include "pattern_scanner_utils.h" - -namespace accts { - const char* const DefaultEncodingsCStrings[] = {"UTF-8", "UTF-16LE"}; - - const vector DefaultEncodings( - DefaultEncodingsCStrings, - DefaultEncodingsCStrings + - sizeof(DefaultEncodingsCStrings)/sizeof(DefaultEncodingsCStrings[0]) - ); - - const vector OnlyUTF8Encoding(1, "UTF-8"); - - const vector OnlyUTF16LEEncoding(1, "UTF-16LE"); - - const LG_KeyOptions DefaultOptions = { 0, 1 }; // patterns, case-insensitive - - // - // helper functions - // - - bool is_pdf_box(const sbuf_t& sbuf, size_t pos) { - const char box[] = "Box"; - const size_t c0 = pos >= 10 ? pos - 10 : 10 - pos - 1; - const uint8_t* i = search(sbuf.buf + c0, sbuf.buf + pos, box, box + strlen(box)); - return i != sbuf.buf + pos; -/* - return i != sbuf.buf + pos && ( - (i + 2 < sbuf.buf + pos && *(i+1) == ' ' && *(i+2) == '[') - || *(i+1) == '[' - ); -*/ - } - - inline bool valid_char(char ch) { - return isdigit(ch) || isspace(ch) || ch=='[' || ch==']' || - ch=='<' || ch=='Z' || ch=='.' || ch=='l' || ch=='j'; - } - - bool valid_phone_utf16le(const sbuf_t& sbuf, size_t pos, size_t len) { - // We want invalid characters before and after (assuming there is a - // before and after) - bool invalid_before = false; - bool invalid_after = false; - - if (pos > 16) { - for (size_t i = pos-16; i < pos; ++i) { - if (sbuf[i] != '\0' && !valid_char(sbuf[i])) { - invalid_before = true; - break; - } - } - } - else { - invalid_before = true; - } - - if (sbuf.bufsize < pos+len+16) { - for (size_t i = pos+len; i < pos+len+16; ++i) { - if (sbuf[i] != '\0' && !valid_char(sbuf[i])) { - invalid_after = true; - break; - } - } - } - else { - invalid_after = true; - } - - /* - * 2013-05-28: if followed by ' #{1,5} ' then it's not a phone either! - */ - if (pos+len+10 < sbuf.bufsize) { - if (sbuf[pos+len] == ' ' && sbuf[pos+len+1] == '\0' && - isdigit(sbuf[pos+len+2]) && sbuf[pos+len+3] == '\0') { - for (size_t i = pos+len+2; i+3 < sbuf.bufsize && i < pos+len+16; i += 2) { - if (isdigit(sbuf[i]) && sbuf[i+1] == '\0' && - sbuf[i+2] == ' ' && sbuf[i+3] == '\0') { - return false; // not valid - } - } - } - } - - /* If it is followed by a dash and a number, it's not a phone number */ - if (pos+len+4 < sbuf.bufsize) { - if (sbuf[pos+len] == '-' && sbuf[pos+len+1] == '\0' && - isdigit(sbuf[pos+len+2] && sbuf[pos+len+3] == '\0')) { - return false; - } - } - - return invalid_before && invalid_after; - } - - // - // subpatterns - // - -// const string END("([^0-9e.]|(\\.[^0-9]))"); - const string END("([^\\z2E\\z30-\\z39\\z45\\z65]|(\\.[^\\z30-\\z39]))"); - const string BLOCK("[0-9]{4}"); - const string DELIM("[- ]"); - const string DB("(" + BLOCK + DELIM + ")"); - const string SDB("([45][0-9]{3}" + DELIM + ")"); - const string TDEL("[ /.-]"); - - const string PHONETEXT_UTF8_CTX("[^\\z41-\\z5A\\z61-\\z7A]"); - const string PHONETEXT_UTF16LE_CTX("([^\\z41-\\z5A\\z61-\\z7A]\\z00|[^\\z00])"); - const string PHONETEXT_COMMON("(tel[.ephon]*|fax|facsimile|DSN|telex|TTD|mobile|cell):?"); - const string PHONETEXT_UTF8("(" + PHONETEXT_UTF8_CTX + PHONETEXT_COMMON + ")"); - const string PHONETEXT_UTF16LE("(" + PHONETEXT_UTF16LE_CTX + PHONETEXT_COMMON + ")"); - - const string YEAR("(19[0-9][0-9]|20[01][0-9])"); - const string MONTH("(Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(tember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?|0?[1-9]|1[0-2])"); - const string DAY("([0-2]?[0-9]|3[01])"); - - const string SYEAR("([0-9][0-9])"); - const string SMONTH("([01][0-2])"); - - const string DATEA("(" + YEAR + "-" + MONTH + "-" + DAY + ")"); - const string DATEB("(" + YEAR + "/" + MONTH + "/" + DAY + ")"); - const string DATEC("(" + DAY + " " + MONTH + " " + YEAR + ")"); - const string DATED("(" + MONTH + " " + DAY + "[, ]+" + YEAR + ")"); - - const string DATEFORMAT("(" + DATEA + "|" + DATEB + "|" + DATEC + "|" + DATED + ")"); - - // - // the scaner - // - - class Scanner: public PatternScanner { - public: - Scanner(): PatternScanner("accts_lg"), CCN_Recorder(0), CCN_Track2_Recorder(0), Telephone_Recorder(0), Alert_Recorder(0), PII_Recorder(0), SIN_Recorder(0) {} - virtual ~Scanner() {} - - virtual Scanner* clone() const { return new Scanner(*this); } - - virtual void startup(const scanner_params& sp); - virtual void init(const scanner_params& sp); - virtual void initScan(const scanner_params&); - - feature_recorder* CCN_Recorder; - feature_recorder* CCN_Track2_Recorder; - feature_recorder* Telephone_Recorder; - feature_recorder* Alert_Recorder; - feature_recorder* PII_Recorder; - feature_recorder* SIN_Recorder; - - void ccnHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void ccnUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void ccnTrack2HitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void ccnTrack2UTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void telephoneHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void telephoneUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void telephoneTrailingCtxHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void telephoneTrailingCtxUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void validatedTelephoneHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void validatedTelephoneUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void bitlockerHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void bitlockerUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void piiHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void piiUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void sinHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void sinUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void sinHitHandler2(const LG_SearchHit& hit, const scanner_params& sp); - - void sinUTF16LEHitHandler2(const LG_SearchHit& hit, const scanner_params& sp); - - void dateHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - private: - Scanner(const Scanner& s): - PatternScanner(s), - CCN_Recorder(s.CCN_Recorder), - CCN_Track2_Recorder(s.CCN_Track2_Recorder), - Telephone_Recorder(s.Telephone_Recorder), - Alert_Recorder(s.Alert_Recorder), - PII_Recorder(s.PII_Recorder), - SIN_Recorder(s.SIN_Recorder) - {} - - Scanner& operator=(const Scanner&); - }; - - void Scanner::startup(const scanner_params& sp) { - sp.check_version(); - - sp.info->name = "accts_lg"; - sp.info->author = "Simson L. Garfinkel, modified by Tim Walsh"; - sp.info->description = "scans for CCNs, track 2, PII (including SSN and Canadian SIN), and phone #s"; - sp.info->scanner_version = "1.0"; - - // define the feature files this scanner creates - sp.info->feature_names.insert("ccn"); - sp.info->feature_names.insert("pii"); // personally identifiable information - sp.info->feature_names.insert("sin"); // canadian social insurance number - sp.info->feature_names.insert("ccn_track2"); - sp.info->feature_names.insert("telephone"); - sp.info->histogram_defs.insert(histogram_def("ccn", "", "histogram")); - sp.info->histogram_defs.insert(histogram_def("ccn_track2", "", "histogram")); - - // define the histograms to make - sp.info->histogram_defs.insert( - histogram_def("telephone", "", "histogram", HistogramMaker::FLAG_NUMERIC) - ); - - scan_ccns2_debug = sp.info->config->debug; // get debug value - } - - void Scanner::init(const scanner_params& sp) { - // - // patterns - // - - // FIXME: leading context - // FIXME: trailing context - /* #### #### #### #### --- most credit card numbers*/ - const string REGEX2("[^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A]" + SDB + DB + DB + BLOCK + END); - - new Handler( - *this, - REGEX2, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::ccnHitHandler - ); - - const string REGEX2_UTF16LE("([^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A]\\z00|[^\\z00])" + SDB + DB + DB + BLOCK + END); - - new Handler( - *this, - REGEX2_UTF16LE, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::ccnUTF16LEHitHandler - ); - - // FIXME: leading context - // FIXME: trailing context - /* 3### ###### ######### --- 15 digits beginning with 3 and funny space. */ - /* Must be american express... */ - const string REGEX3("[^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A\\z2E]3[0-9]{3}" + DELIM + "[0-9]{6}" + DELIM + "[0-9]{5}" + END); - - new Handler( - *this, - REGEX3, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::ccnHitHandler - ); - - const string REGEX3_UTF16LE("([^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A\\z2E]\\z00|[^\\z00])3[0-9]{3}" + DELIM + "[0-9]{6}" + DELIM + "[0-9]{5}" + END); - - new Handler( - *this, - REGEX3_UTF16LE, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::ccnUTF16LEHitHandler - ); - - // FIXME: leading context - // FIXME: trailing context - /* 3### ###### ######### --- 15 digits beginning with 3 and funny space. */ - /* Must be american express... */ - const string REGEX4("[^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A\\z2E]3[0-9]{14}" + END); - - new Handler( - *this, - REGEX4, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::ccnHitHandler - ); - - const string REGEX4_UTF16LE("([^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A\\z2E]\\z00|[^\\z00])3[0-9]{14}" + END); - - new Handler( - *this, - REGEX4_UTF16LE, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::ccnUTF16LEHitHandler - ); - - // FIXME: leading context - // FIXME: trailing context - /* ############### 13-19 numbers as a block beginning with a 4 or 5 - * followed by something that is not a digit. - * Yes, CCNs can now be up to 19 digits long. - * http://www.creditcards.com/credit-card-news/credit-card-appearance-1268.php - */ - const string REGEX5("[^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A\\z2E][4-6][0-9]{15,18}" + END); - - new Handler( - *this, - REGEX5, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::ccnHitHandler - ); - - const string REGEX5_UTF16LE("([^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A\\z2E]\\z00|[^\\z00])[4-6][0-9]{15,18}" + END); - - new Handler( - *this, - REGEX5_UTF16LE, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::ccnUTF16LEHitHandler - ); - - // FIXME: leading context - /* ;###############=YYMM101#+? --- track2 credit card data */ - /* {SYEAR}{SMONTH} */ - /* ;CCN=05061010000000000738? */ - const string REGEX6("[^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A][4-6][0-9]{15,18}=" + SYEAR + SMONTH + "101[0-9]{13}"); - - new Handler( - *this, - REGEX6, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::ccnTrack2HitHandler - ); - - const string REGEX6_UTF16LE("([^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A]\\z00|[^\\z00])[4-6][0-9]{15,18}=" + SYEAR + SMONTH + "101[0-9]{13}"); - - new Handler( - *this, - REGEX6_UTF16LE, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::ccnTrack2UTF16LEHitHandler - ); - - // FIXME: trailing context - // FIXME: leading context - /* US phone numbers without area code in parens */ - /* New addition: If proceeded by " ####? ####? " - * then do not consider this a phone number. We see a lot of that stuff in - * PDF files. - */ - const string REGEX7("[^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A]([0-9]{3}" + TDEL + "){2}[0-9]{4}" + END); - - new Handler( - *this, - REGEX7, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::validatedTelephoneHitHandler - ); - - const string REGEX7_UTF16LE("([^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A]\\z00|[^\\z00])([0-9]{3}" + TDEL + "){2}[0-9]{4}" + END); - - new Handler( - *this, - REGEX7, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::validatedTelephoneUTF16LEHitHandler - ); - - // FIXME: trailing context - // FIXME: leading context - /* US phone number with parens, like (215) 555-1212 */ - const string REGEX8("[^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A]\\([0-9]{3}\\)" + TDEL + "?[0-9]{3}" + TDEL + "[0-9]{4}" + END); - - new Handler( - *this, - REGEX8, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::telephoneTrailingCtxHitHandler - ); - - const string REGEX8_UTF16LE("([^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A]\\z00|[^\\z00])\\([0-9]{3}\\)" + TDEL + "?[0-9]{3}" + TDEL + "[0-9]{4}" + END); - - new Handler( - *this, - REGEX8_UTF16LE, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::telephoneTrailingCtxUTF16LEHitHandler - ); - - // FIXME: trailing context - // FIXME: leading context - /* Generalized international phone numbers */ - const string REGEX9("[^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A]\\+[0-9]{1,3}(" + TDEL + "[0-9]{2,3}){2,6}[0-9]{2,4}" + END); - - new Handler( - *this, - REGEX9, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::validatedTelephoneHitHandler - ); - - const string REGEX9_UTF16LE("([^\\z30-\\z39\\z41-\\z5A\\z61-\\z7A]\\z00|[^\\z00])\\+[0-9]{1,3}(" + TDEL + "[0-9]{2,3}){2,6}[0-9]{2,4}" + END); - - new Handler( - *this, - REGEX9, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::validatedTelephoneHitHandler - ); - - // FIXME: leading context - /* Generalized number with prefix */ - const string REGEX10(PHONETEXT_UTF8 + "[0-9/ .+]{7,18}"); - - new Handler( - *this, - REGEX10, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::telephoneHitHandler - ); - - const string REGEX10_UTF16LE(PHONETEXT_UTF16LE + "[0-9/ .+]{7,18}"); - - new Handler( - *this, - REGEX10_UTF16LE, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::telephoneUTF16LEHitHandler - ); - - // FIXME: leading context - /* Generalized number with city code and prefix */ - const string REGEX11(PHONETEXT_UTF8 + "[0-9 +]+ ?\\([0-9]{2,4}\\) ?[\\-0-9]{4,8}"); - - new Handler( - *this, - REGEX11, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::telephoneHitHandler - ); - - const string REGEX11_UTF16LE(PHONETEXT_UTF16LE + "[0-9 +]+ ?\\([0-9]{2,4}\\) ?[\\-0-9]{4,8}"); - - new Handler( - *this, - REGEX11_UTF16LE, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::telephoneUTF16LEHitHandler - ); - - // FIXME: trailing context - /* Generalized international phone numbers */ - const string REGEX12("fedex[^a-z]+([0-9]{4}[- ]?){2}[0-9]" + END); - - new Handler( - *this, - REGEX12, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::piiHitHandler - ); - - new Handler( - *this, - REGEX12, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::piiUTF16LEHitHandler - ); - - // FIXME: trailing context - const string REGEX13("ssn:?[ \\t]+[0-9]{3}-?[0-9]{2}-?[0-9]{4}" + END); - - new Handler( - *this, - REGEX13, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::piiHitHandler - ); - - new Handler( - *this, - REGEX13, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::piiUTF16LEHitHandler - ); - - const string REGEX14("dob:?[ \\t]+" + DATEFORMAT); - - new Handler( - *this, - REGEX14, - DefaultEncodings, - DefaultOptions, - &Scanner::dateHitHandler - ); - - // FIXME: trailing context - const string REGEX15("sin:?[ \\t]+[0-9]{3}[ -]?[0-9]{3}[ -]?[0-9]{3}" + END); - - new Handler( - *this, - REGEX15, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::sinHitHandler - ); - - new Handler( - *this, - REGEX15, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::sinUTF16LEHitHandler - ); - - const string REGEX16("[^0-9][0-9]{3}-[0-9]{3}-[0-9]{3}" + END); - - new Handler( - *this, - REGEX16, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::sinHitHandler2 - ); - - new Handler( - *this, - REGEX16, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::sinUTF16LEHitHandler2 - ); - - // FIXME: leading context - // FIXME: trailing context - /* Possible BitLocker Recovery Key. */ - const string BITLOCKER("[^\\z30-\\z39]([0-9]{6}-){7}[0-9]{6}[^\\z30-\\z39]"); - - new Handler( - *this, - BITLOCKER, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::bitlockerHitHandler - ); - - const string BITLOCKER_UTF16LE("([^\\z30-\\z39]\\z00|[^\\z00])([0-9]{6}-){7}[0-9]{6}[^\\z30-\\z39]"); - - new Handler( - *this, - BITLOCKER, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::bitlockerUTF16LEHitHandler - ); - } - - void Scanner::initScan(const scanner_params& sp) { - CCN_Recorder = sp.fs.named_feature_recorder("ccn"); - CCN_Track2_Recorder = sp.fs.named_feature_recorder("ccn_track2"); - Telephone_Recorder = sp.fs.named_feature_recorder("telephone"); - Alert_Recorder = sp.fs.get_alert_recorder(); - PII_Recorder = sp.fs.named_feature_recorder("pii"); - SIN_Recorder = sp.fs.named_feature_recorder("sin"); - } - - void Scanner::ccnHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + 1; - const size_t len = hit.End - (*(sp.sbuf.buf+hit.End-2) == '.' ? 2 : 1) - pos; - - if (valid_ccn(reinterpret_cast(sp.sbuf.buf)+pos, len)) { - CCN_Recorder->write_buf(sp.sbuf, pos, len); - } - } - - void Scanner::ccnUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + (*(sp.sbuf.buf+hit.Start+1) == '\0' ? 2 : 1); const size_t len = hit.End - pos; - - const string ascii(low_utf16le_to_ascii(sp.sbuf.buf+pos, len)); - if (valid_ccn(ascii.c_str(), ascii.size())) { - CCN_Recorder->write_buf(sp.sbuf, pos, len); - } - } - - void Scanner::ccnTrack2HitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + 1; - const size_t len = hit.End - pos; - - if (valid_ccn(reinterpret_cast(sp.sbuf.buf)+pos, len)) { - CCN_Recorder->write_buf(sp.sbuf, pos, len); - } - } - - void Scanner::ccnTrack2UTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + (*(sp.sbuf.buf+hit.Start+1) == '\0' ? 2 : 1); - const size_t len = hit.End - pos; - - const string ascii(low_utf16le_to_ascii(sp.sbuf.buf+pos, len)); - if (valid_ccn(ascii.c_str(), ascii.size())) { - CCN_Recorder->write_buf(sp.sbuf, pos, len); - } - } - - void Scanner::telephoneHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - Telephone_Recorder->write_buf(sp.sbuf, hit.Start+1, hit.End-hit.Start-1); - } - - void Scanner::telephoneUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t start = hit.Start + (*(sp.sbuf.buf + hit.Start + 1) == '\0' ? 2 : 1); - const size_t len = hit.End - start; - - Telephone_Recorder->write_buf(sp.sbuf, start, len); - } - - void Scanner::telephoneTrailingCtxHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - Telephone_Recorder->write_buf( - sp.sbuf, - hit.Start+1, - hit.End - (*(sp.sbuf.buf+hit.End-2) == '.' ? 2 : 1) - (hit.Start+1) - ); - } - - void Scanner::telephoneTrailingCtxUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - Telephone_Recorder->write_buf( - sp.sbuf, - hit.Start+1, - hit.End - (*(sp.sbuf.buf+hit.End-3) == '.' ? 3 : 1) -(hit.Start+1) - ); - } - - void Scanner::validatedTelephoneHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + 1; - const size_t len = hit.End - (*(sp.sbuf.buf+hit.End-2) == '.' ? 2 : 1) - pos; - if (valid_phone(sp.sbuf, pos, len)){ - if (!is_pdf_box(sp.sbuf, pos)) { - Telephone_Recorder->write_buf(sp.sbuf, pos, len); - } - } - } - - void Scanner::validatedTelephoneUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + 1; - const size_t len = hit.End - (*(sp.sbuf.buf+hit.End-2) == '.' ? 2 : 1) - pos; - if (valid_phone_utf16le(sp.sbuf, pos, len)){ - Telephone_Recorder->write_buf(sp.sbuf, pos, len); - } - } - - void Scanner::bitlockerHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - Alert_Recorder->write(sp.sbuf.pos0 + hit.Start + 1, reinterpret_cast(sp.sbuf.buf) + 1, "Possible BitLocker Recovery Key (ASCII)."); - } - - void Scanner::bitlockerUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + (*(sp.sbuf.buf + hit.Start + 1) == '\0' ? 2 : 1); - const size_t len = (hit.End - 1) - pos; - - Alert_Recorder->write(sp.sbuf.pos0 + pos, low_utf16le_to_ascii(sp.sbuf.buf + pos, len), "Possible BitLocker Recovery Key (UTF-16)."); - } - - void Scanner::piiHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - PII_Recorder->write_buf( - sp.sbuf, hit.Start, - hit.End - (*(sp.sbuf.buf+hit.End-2) == '.' ? 2 : 1) - hit.Start - ); - } - - void Scanner::piiUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - PII_Recorder->write_buf( - sp.sbuf, hit.Start, - hit.End - (*(sp.sbuf.buf+hit.End-3) == '.' ? 3 : 1) - hit.Start - ); - } - - void Scanner::sinHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - SIN_Recorder->write_buf( - sp.sbuf, hit.Start, - hit.End - (*(sp.sbuf.buf+hit.End-2) == '.' ? 2 : 1) - hit.Start - ); - } - - void Scanner::sinUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - SIN_Recorder->write_buf( - sp.sbuf, hit.Start, - hit.End - (*(sp.sbuf.buf+hit.End-3) == '.' ? 3 : 1) - hit.Start - ); - } - - void Scanner::sinHitHandler2(const LG_SearchHit& hit, const scanner_params& sp) { - SIN_Recorder->write_buf( - sp.sbuf, hit.Start+1, - hit.End - (*(sp.sbuf.buf+hit.End-2) == '.' ? 2 : 1) - hit.Start - ); - } - - void Scanner::sinUTF16LEHitHandler2(const LG_SearchHit& hit, const scanner_params& sp) { - SIN_Recorder->write_buf( - sp.sbuf, hit.Start+1, - hit.End - (*(sp.sbuf.buf+hit.End-3) == '.' ? 3 : 1) - hit.Start - ); - } - - void Scanner::dateHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - PII_Recorder->write_buf(sp.sbuf, hit.Start, hit.End - hit.Start); - } - - Scanner TheScanner; -} - -extern "C" -void scan_accts_lg(struct scanner_params &sp) { - scan_lg(accts::TheScanner, sp, rcb); -} - -#endif // HAVE_LIBLIGHTGREP diff --git a/src/scan_base16_lg.cpp b/src/scan_base16_lg.cpp deleted file mode 100644 index 75ce4a9d..00000000 --- a/src/scan_base16_lg.cpp +++ /dev/null @@ -1,220 +0,0 @@ -#include "config.h" - -// if liblightgrep isn't present, compiles to nothing -#ifdef HAVE_LIBLIGHTGREP - -#include - -#include "be20_api/scanner_params.h" -#include "histogram.h" -#include "pattern_scanner.h" - -namespace base16 { -// const char* const DefaultEncodingsCStrings[] = {"UTF-8", "UTF-16LE"}; - const char* const DefaultEncodingsCStrings[] = {"UTF-8"}; - - const vector DefaultEncodings( - DefaultEncodingsCStrings, - DefaultEncodingsCStrings + - sizeof(DefaultEncodingsCStrings)/sizeof(DefaultEncodingsCStrings[0]) - ); - - const LG_KeyOptions DefaultOptions = { 0, 1 }; // patterns, case-insensitive - - // - // the scanner - // - - class Scanner: public PatternScanner { - public: - Scanner(): PatternScanner("base16_lg"), Recorder(0) {} - virtual ~Scanner() {} - - virtual Scanner* clone() const { return new Scanner(*this); } - - virtual void startup(const scanner_params& sp); - virtual void init(const scanner_params& sp); - virtual void initScan(const scanner_params&); - - feature_recorder &Recorder; - - void hitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - decode(sp.sbuf, hit.Start, hit.End - hit.Start, sp, rcb); - } - - private: - Scanner(const Scanner& s): - PatternScanner(s), - Recorder(s.Recorder) - {} - - Scanner& operator=(const Scanner&); - - void decode(const sbuf_t& osbuf, size_t pos, size_t len, const scanner_params& sp); - }; - - const uint16_t BASE16_LSN[256] = { - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 256, 256, 256, 256, 256, 256, - 256, 10, 11, 12, 13, 14, 15, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 10, 11, 12, 13, 14, 15, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256 - }; - - const uint16_t BASE16_MSN[256] = { - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 0, 16, 32, 48, 64, 80, 96, 112, - 128, 144, 256, 256, 256, 256, 256, 256, - 256, 160, 176, 192, 208, 224, 240, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 160, 176, 192, 208, 224, 240, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 556, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256, - 256, 256, 256, 256, 256, 256, 256, 256 - }; - - void Scanner::startup(const scanner_params& sp) { - sp.check_version(); - - sp.info.set_name("base16_lg"); - sp.info->name = "base16_lg"; - sp.info->author = "Simson L. Garfinkel"; - sp.info->description = "Base16 (hex) scanner"; - sp.info->scanner_version = "1.0"; - sp.info->flags = scanner_info::SCANNER_RECURSE; - sp.info->feature_names.insert("hex"); // notable hex values - } - - void Scanner::init(const scanner_params& sp) { - // - // patterns - // - - /* - * a hex string - * {0,2} means we have 0-2 space characters - * {6,} means minimum of 6 hex bytes - */ - const std::string HEX("[0-9A-F]{2}(([ \\n]|\\r\\n){0,2}[0-9A-F]{2}){5,}"); - - new Handler( - *this, HEX, DefaultEncodings, DefaultOptions, &Scanner::hitHandler - ); - } - - void Scanner::initScan(const scanner_params& sp) { - Recorder = sp.fs.named_feature_recorder("hex"); - } - - // Don't re-analyze hex bufs smaller than this - const unsigned int opt_min_hex_buf = 64; - - size_t base16_decode_skipping_invalid(uint8_t* dst_start, const uint8_t* src, const uint8_t* src_end) { - uint8_t* dst = dst_start; - uint16_t byte; - uint8_t msn, lsn; - - while (src < src_end) { - msn = *src++; - lsn = *src++; - byte = BASE16_MSN[msn] | BASE16_LSN[lsn]; - if (byte < 0x100) { - *dst++ = static_cast(byte); - } - else { - // A "byte" value over FF means we've hit something invalid. The - // pattern requires that hex digits come in pairs, so the first - // character is invalid. Just advance one byte (== backing up one - // byte now, since we've already gone ahead two bytes). - --src; - } - } - - return dst - dst_start; - } - - void Scanner::decode(const sbuf_t& osbuf, size_t pos, size_t len, const scanner_params& sp) { - sbuf_t sbuf(osbuf, pos, len); // the substring we are working with - - TODO: Replace managed_malloc with a sbuf_t::sbuf_malloc - managed_malloc b(sbuf.pagesize/2); - if (b.buf == 0) return; - - const size_t p = base16_decode_skipping_invalid( - b.buf, sbuf.buf, sbuf.buf+sbuf.pagesize - ); - - // Alert on byte sequences of 48, 128 or 256 bits - if (p == 48/8 || p == 128/8 || p == 256/8) { - // it validates; write original with context - Recorder->write_buf(osbuf, pos, len); - return; // Small keys don't get recursively analyzed - } - - if (p > opt_min_hex_buf) { - // NB: we manually add BASE16 here when recursing, because - // rcb.partName is LIGHTGREP here, which is not useful. - sbuf_t nsbuf(osbuf.pos0 + pos + "BASE16", b.buf, p, p, false); - (*rcb.callback)(scanner_params(sp, nsbuf)); // recurse - } - } - - Scanner TheScanner; -} - -extern "C" -void scan_base16_lg(struct scanner_params &sp) { - scan_lg(base16::TheScanner, sp, rcb); -} - -#endif // HAVE_LIBLIGHTGREP diff --git a/src/scan_email_lg.cpp b/src/scan_email_lg.cpp deleted file mode 100644 index 84db124f..00000000 --- a/src/scan_email_lg.cpp +++ /dev/null @@ -1,511 +0,0 @@ -#include "config.h" - -// if liblightgrep isn't present, compiles to nothing -#ifdef HAVE_LIBLIGHTGREP - -#include -#include -#include - -#include "be20_api/scanner_params.h" - -#include "histogram.h" -#include "pattern_scanner.h" -#include "pattern_scanner_utils.h" -#include "utils.h" // needs config.h - -using namespace std; - -namespace email { - const char* const DefaultEncodingsCStrings[] = {"UTF-8", "UTF-16LE"}; - - const vector DefaultEncodings( - DefaultEncodingsCStrings, - DefaultEncodingsCStrings + - sizeof(DefaultEncodingsCStrings)/sizeof(DefaultEncodingsCStrings[0]) - ); - - const vector OnlyUTF8Encoding(1, "UTF-8"); - - const vector OnlyUTF16LEEncoding(1, "UTF-16LE"); - - const LG_KeyOptions DefaultOptions = { 0, 1 }; // patterns, case-insensitive - - // - // subpatterns - // - - const string INUM("(1?[0-9]{1,2}|2([0-4][0-9]|5[0-5]))"); - const string HEX("[0-9a-f]"); - const string ALNUM("[a-zA-Z0-9]"); - - const string PC("[\\x20-\\x7E]"); - - const string TLD("(AC|AD|AE|AERO|AF|AG|AI|AL|AM|AN|AO|AQ|AR|ARPA|AS|ASIA|AT|AU|AW|AX|AZ|BA|BB|BD|BE|BF|BG|BH|BI|BIZ|BJ|BL|BM|BN|BO|BR|BS|BT|BV|BW|BY|BZ|CA|CAT|CC|CD|CF|CG|CH|CI|CK|CL|CM|CN|CO|COM|COOP|CR|CU|CV|CX|CY|CZ|DE|DJ|DK|DM|DO|DZ|EC|EDU|EE|EG|EH|ER|ES|ET|EU|FI|FJ|FK|FM|FO|FR|GA|GB|GD|GE|GF|GG|GH|GI|GL|GM|GN|GOV|GP|GQ|GR|GS|GT|GU|GW|GY|HK|HM|HN|HR|HT|HU|ID|IE|IL|IM|IN|INFO|INT|IO|IQ|IR|IS|IT|JE|JM|JO|JOBS|JP|KE|KG|KH|KI|KM|KN|KP|KR|KW|KY|KZ|LA|LB|LC|LI|LK|LR|LS|LT|LU|LV|LY|MA|MC|MD|ME|MF|MG|MH|MIL|MK|ML|MM|MN|MO|MOBI|MP|MQ|MR|MS|MT|MU|MUSEUM|MV|MW|MX|MY|MZ|NA|NAME|NC|NE|NET|NF|NG|NI|NL|NO|NP|NR|NU|NZ|OM|ORG|PA|PE|PF|PG|PH|PK|PL|PM|PN|PR|PRO|PS|PT|PW|PY|QA|RE|RO|RS|RU|RW|SA|SB|SC|SD|SE|SG|SH|SI|SJ|SK|SL|SM|SN|SO|SR|ST|SU|SV|SY|SZ|TC|TD|TEL|TF|TG|TH|TJ|TK|TL|TM|TN|TO|TP|TR|TRAVEL|TT|TV|TW|TZ|UA|UG|UK|UM|US|UY|UZ|VA|VC|VE|VG|VI|VN|VU|WF|WS|YE|YT|YU|ZA|ZM|ZW)"); - - const string YEAR("(19[6-9][0-9]|20[0-1][0-9])"); - const string DAYOFWEEK("(Mon|Tue|Wed|Thu|Fri|Sat|Sun)"); - const string MONTH("(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"); - const string ABBREV("(UTC?|GMT|EST|EDT|CST|CDT|MST|MDT|PST|PDT|[ZAMNY])"); - - // - // helper functions - // - - // NB: It is very important *not* to use functions expecting C strings - // or std::strings on hit data, as hit data could contain internal null - // bytes. - - /** return the offset of the domain in an email address. - * returns buflen + 1 if the domain is not found. - * the domain extends to the end of the email address - */ - inline size_t find_domain_in_email(const uint8_t* buf, size_t buflen) { - return find(buf, buf + buflen, '@') - buf + 1; - } - - template - inline size_t find_domain_in_url(const T* buf, size_t buflen, size_t& domain_len) { - const T* dbeg = search_n(buf, buf + buflen, 2, '/') + 2; - if (dbeg < buf + buflen) { - const T stop[] = { '/', ':' }; - const T* dend = find_first_of(dbeg, buf + buflen, stop, stop + 2); - domain_len = dend - dbeg; - return dbeg - buf; - } - - return buflen; - } - - bool valid_ether_addr(const uint8_t* buf) { - if (memcmp((const uint8_t *)"00:00:00:00:00:00", buf, 17) == 0) { - return false; - } - - if (memcmp((const uint8_t *)"00:11:22:33:44:55", buf, 17) == 0) { - return false; - } - - /* Perform a quick histogram analysis. - * For each group of characters, create a value based on the two digits. - * There is no need to convert them to their 'actual' value. - * Don't accept a histogram that has 3 values. That could be - * 11:11:11:11:22:33 - * Require 4, 5 or 6. - * If we have 4 or more distinct values, then treat it good. - * Otherwise its is some pattern we don't want. - */ - set ctr; - for (uint32_t i = 0; i < 6; ++i) { // loop over each group - // create a unique value of the two characters - ctr.insert((buf[i*3] << 8) + buf[i*3+1]); - } - - return ctr.size() >= 4; - } - - template - bool valid_ipaddr(const T* leftguard, const T* hit) { - // copy up to 'window' preceding Ts into context array - static const ssize_t window = 8; - T context[window] = { ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ' }; - const ssize_t diff = min(hit - leftguard, window); - copy(hit - diff, hit, context + window - diff); - - if ( - isalnum(context[7]) || - context[7] == '.' || - context[7] == '-' || - context[7] == '+' || - (ishexnumber(context[4]) && ishexnumber(context[5]) && - ishexnumber(context[6]) && context[7] == '}') || - (*hit == '0' && *(hit + 1) == '.')) - { - // ignore - return false; - } - - static const struct { - size_t pos; - const char* str; - } checks[] = { - { 5, "v." }, - { 5, "v " }, - { 5, "rv:" }, // rv:1.9.2.8 as in Mozilla - { 4, ">=" }, // >= 1.8.0.10 - { 4, "<=" }, // <= 1.8.0.10 - { 4, "<<" }, // << 1.8.0.10 - { 4, "ver" }, - { 4, "Ver" }, - { 4, "VER" }, - { 0, "rsion" }, - { 0, "ion=" }, - { 0, "PSW/" }, // PWS/1.5.19.3 ... - { 0, "flash=" }, // flash= - { 0, "stone=" }, // Milestone= - { 4, "NSS" }, - { 0, "/2001," }, // /2001,3.60.50.8 - { 0, "TI_SZ" } // %REG_MULTI_SZ%, - }; - - for (size_t i = 0; i < sizeof(checks)/sizeof(checks[0]); ++i) { - if (search( - context + checks[i].pos, - context + 8, checks[i].str, - checks[i].str + strlen(checks[i].str) - ) != context + 8) { - return false; - } - } - - return true; - } - - // - // the scanner - // - - class Scanner: public PatternScanner { - public: - Scanner(): PatternScanner("email_lg"), RFC822_Recorder(0), Email_Recorder(0), Domain_Recorder(0), Ether_Recorder(0), URL_Recorder(0) {} - virtual ~Scanner() {} - - virtual Scanner* clone() const { return new Scanner(*this); } - - virtual void startup(const scanner_params& sp); - virtual void init(const scanner_params& sp); - virtual void initScan(const scanner_params& sp); - - feature_recorder* RFC822_Recorder; - feature_recorder* Email_Recorder; - feature_recorder* Domain_Recorder; - feature_recorder* Ether_Recorder; - feature_recorder* URL_Recorder; - - void rfc822HitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void emailHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void emailUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void ipaddrHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void ipaddrUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void etherHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void etherUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void protoHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void protoUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp); - - private: - Scanner(const Scanner& s): - PatternScanner(s), - RFC822_Recorder(s.RFC822_Recorder), - Email_Recorder(s.Email_Recorder), - Domain_Recorder(s.Domain_Recorder), - Ether_Recorder(s.Ether_Recorder), - URL_Recorder(s.URL_Recorder) - {} - - Scanner& operator=(const Scanner&); - }; - - void Scanner::startup(const scanner_params& sp) { - sp.check_version(); - - sp.info->name = "email_lg"; - sp.info->author = "Simson L. Garfinkel"; - sp.info->description = "Scans for email addresses, domains, URLs, RFC822 headers, etc."; - sp.info->scanner_version = "1.0"; - - // define the feature files this scanner creates - sp.info->feature_names.insert("email"); - sp.info->feature_names.insert("domain"); - sp.info->feature_names.insert("url"); - sp.info->feature_names.insert("rfc822"); - sp.info->feature_names.insert("ether"); - - // define the histograms to make - sp.info->histogram_defs.insert(histogram_def("email", "", "histogram", HistogramMaker::FLAG_LOWERCASE)); - sp.info->histogram_defs.insert(histogram_def("domain", "", "histogram")); - sp.info->histogram_defs.insert(histogram_def("url", "", "histogram")); - sp.info->histogram_defs.insert(histogram_def("url", "://([^/]+)", "services")); - sp.info->histogram_defs.insert(histogram_def("url", "://((cid-[0-9a-f])+[a-z.].live.com/)", "microsoft-live")); - sp.info->histogram_defs.insert(histogram_def("url", "://[-_a-z0-9.]+facebook.com/.*[&?]{1}id=([0-9]+)", "facebook-id")); - sp.info->histogram_defs.insert(histogram_def("url", "://[-_a-z0-9.]+facebook.com/([a-zA-Z0-9.]*[^/?&]$)", "facebook-address", HistogramMaker::FLAG_LOWERCASE)); - sp.info->histogram_defs.insert(histogram_def("url", "search.*[?&/;fF][pq]=([^&/]+)", "searches")); - } - - void Scanner::init(const scanner_params& sp) { - // - // patterns - // - - const string DATE(DAYOFWEEK + ",[ \\t\\n\\r]+[0-9]{1,2}[ \\t\\n\\r]+" + MONTH + "[ \\t\\n\\r]+" + YEAR + "[ \\t\\n\\r]+[0-2][0-9]:[0-5][0-9]:[0-5][0-9][ \\t\\n\\r]+([+-][0-2][0-9][0314][05]|" + ABBREV + ")"); - - new Handler( - *this, - DATE, - DefaultEncodings, - DefaultOptions, - &Scanner::rfc822HitHandler - ); - - const string MESSAGE_ID("Message-ID:([ \\t\\n]|\\r\\n)?<" + PC + "+>"); - - new Handler( - *this, - MESSAGE_ID, - DefaultEncodings, - DefaultOptions, - &Scanner::rfc822HitHandler - ); - - const string SUBJECT("Subject:[ \\t]?" + PC + "+"); - - new Handler( - *this, - SUBJECT, - DefaultEncodings, - DefaultOptions, - &Scanner::rfc822HitHandler - ); - - const string COOKIE("Cookie:[ \\t]?" + PC + "+"); - - new Handler( - *this, - COOKIE, - DefaultEncodings, - DefaultOptions, - &Scanner::rfc822HitHandler - ); - - const string HOST("Host:[ \\t]?[a-zA-Z0-9._]+"); - - new Handler( - *this, - HOST, - DefaultEncodings, - DefaultOptions, - &Scanner::rfc822HitHandler - ); - - // FIXME: trailing context -// const string EMAIL(ALNUM + "[a-zA-Z0-9._%\\-+]+" + ALNUM + "@" + ALNUM + "[a-zA-Z0-9._%\\-]+\\." + TLD + "[^\\z41-\\z5A\\z61-\\z7A]"); - const string EMAIL(ALNUM + "(\\.?[a-zA-Z0-9_%\\-+])+\\.?" + ALNUM + "@" + ALNUM + "(\\.?[a-zA-Z0-9_%\\-])+\\." + TLD + "[^\\z41-\\z5A\\z61-\\z7A]"); - - new Handler( - *this, - EMAIL, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::emailHitHandler - ); - - new Handler( - *this, - EMAIL, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::emailUTF16LEHitHandler - ); - - // FIXME: leading context - // FIXME: trailing context - // Numeric IP addresses. Get the context before and throw away some things - const string IP("[^\\z30-\\z39\\z2E]" + INUM + "(\\." + INUM + "){3}[^\\z30-\\z39\\z2B\\z2D\\z2E\\z41-\\z5A\\z5F\\z61-\\z7A]"); - - new Handler( - *this, - IP, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::ipaddrHitHandler - ); - - const string IP_UTF16LE("([^\\z30-\\z39\\z2E]\\z00|[^\\z00])" + INUM + "(\\." + INUM + "){3}[^\\z30-\\z39\\z2B\\z2D\\z2E\\z41-\\z5A\\z5F\\z61-\\z7A]"); - - new Handler( - *this, - IP_UTF16LE, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::ipaddrUTF16LEHitHandler - ); - - // FIXME: leading context - // FIXME: trailing context - // found a possible MAC address! - const string MAC("[^\\z30-\\z39\\z3A\\z41-\\z5A\\z61-\\z7A]" + HEX + "{2}(:" + HEX + "{2}){5}[^\\z30-\\z39\\z3A\\z41-\\z5A\\z61-\\z7A]"); - - new Handler( - *this, - MAC, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::etherHitHandler - ); - - const string MAC_UTF16LE("([^\\z30-\\z39\\z3A\\z41-\\z5A\\z61-\\z7A]\\z00|[^\\z00])" + HEX + "{2}(:" + HEX + "{2}){5}[^\\z30-\\z39\\z3A\\z41-\\z5A\\z61-\\z7A]"); - - new Handler( - *this, - MAC, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::etherUTF16LEHitHandler - ); - - const string PROTO("(https?|afp|smb)://[a-zA-Z0-9_%/\\-+@:=&?#~.;]+"); - - new Handler( - *this, - PROTO, - OnlyUTF8Encoding, - DefaultOptions, - &Scanner::protoHitHandler - ); - - new Handler( - *this, - PROTO, - OnlyUTF16LEEncoding, - DefaultOptions, - &Scanner::protoUTF16LEHitHandler - ); - } - - void Scanner::initScan(const scanner_params& sp) { - RFC822_Recorder = sp.named_feature_recorder("rfc822"); - Email_Recorder = sp.named_feature_recorder("email"); - Domain_Recorder = sp.named_feature_recorder("domain"); - Ether_Recorder = sp.named_feature_recorder("ether"); - URL_Recorder = sp.named_feature_recorder("url"); - } - - void Scanner::rfc822HitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - RFC822_Recorder->write_buf(sp.sbuf, hit.Start, hit.End - hit.Start); - } - - void Scanner::emailHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t len = (hit.End - 1) - hit.Start; - const uint8_t* matchStart = sp.sbuf.buf + hit.Start; - - Email_Recorder->write_buf(sp.sbuf, hit.Start, len); - const size_t domain_off = find_domain_in_email(matchStart, len); - if (domain_off < len) { - Domain_Recorder->write_buf(sp.sbuf, hit.Start + domain_off, len - domain_off); - } - } - - void Scanner::emailUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t len = (hit.End - 1) - hit.Start; - const uint8_t* matchStart = sp.sbuf.buf + hit.Start; - - Email_Recorder->write_buf(sp.sbuf, hit.Start, len); - const size_t domain_off = find_domain_in_email(matchStart, len) + 1; - if (domain_off < len) { - Domain_Recorder->write_buf(sp.sbuf, hit.Start + domain_off, len - domain_off); - } - } - - void Scanner::ipaddrHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - if (valid_ipaddr(sp.sbuf.buf, sp.sbuf.buf + hit.Start + 1)) { - Domain_Recorder->write_buf(sp.sbuf, hit.Start+1, hit.End - hit.Start - 2); - } - } - - void Scanner::ipaddrUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + (*(sp.sbuf.buf+hit.Start+1) == '\0' ? 2 : 1); - const size_t len = (hit.End - 1) - pos; - // this assumes sp.sbuf.pos will never be an odd memory address... - // if pos is odd, add 1 to sbuf.buf and use it as a leftmost guard - const uint16_t* leftguard(reinterpret_cast(sp.sbuf.buf + ((pos & 0x01) == 1 ? 1: 0))); - if (valid_ipaddr(leftguard, reinterpret_cast(sp.sbuf.buf + pos))) { - Domain_Recorder->write_buf(sp.sbuf, pos, len); - } - } - - void Scanner::etherHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + 1; - const size_t len = (hit.End - 1) - pos; - if (valid_ether_addr(sp.sbuf.buf+pos)){ - Ether_Recorder->write_buf(sp.sbuf, pos, len); - } - } - - void Scanner::etherUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const size_t pos = hit.Start + (*(sp.sbuf.buf+hit.Start+1) == '\0' ? 2 : 1); - const size_t len = (hit.End -1) - pos; - - const string ascii(low_utf16le_to_ascii(sp.sbuf.buf+pos, len)); - if (valid_ether_addr(reinterpret_cast(ascii.c_str()))){ - Ether_Recorder->write_buf(sp.sbuf, pos, len); - } - } - - void Scanner::protoHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - // for reasons that aren't clear, there are a lot of net protocols that - // have an http://domain in them followed by numbers. So this counts the - // number of slashes and if it is only 2 the size is pruned until the - // last character is a letter - const int slash_count = count( - sp.sbuf.buf + hit.Start, - sp.sbuf.buf + hit.End, '/' - ); - - size_t len = hit.End - hit.Start; - - if (slash_count == 2) { - while (len > 0 && !isalpha(sp.sbuf[hit.Start+len-1])) { - --len; - } - } - - URL_Recorder->write_buf(sp.sbuf, hit.Start, len); - - size_t domain_len = 0; - size_t domain_off = find_domain_in_url(sp.sbuf.buf + hit.Start, len, domain_len); // find the start of domain? - if (domain_off < len && domain_len > 0) { - Domain_Recorder->write_buf(sp.sbuf, hit.Start + domain_off, domain_len); - } - } - - void Scanner::protoUTF16LEHitHandler(const LG_SearchHit& hit, const scanner_params& sp) { - const int slash_count = count( - sp.sbuf.buf + hit.Start, - sp.sbuf.buf + hit.End, '/' - ); - - size_t len = hit.End - hit.Start; - - if (slash_count == 2) { - while (len > 1 && !isalpha(sp.sbuf[hit.Start+len-2])) { - len -= 2; - } - } - - URL_Recorder->write_buf(sp.sbuf, hit.Start, len); - - size_t domain_len = 0; - size_t domain_off = find_domain_in_url(reinterpret_cast(sp.sbuf.buf + hit.Start), len/2, domain_len); // find the start of domain? - domain_off *= 2; - domain_len *= 2; - if (domain_off < len && domain_len > 0) { - Domain_Recorder->write_buf(sp.sbuf, hit.Start + domain_off, domain_len); - } - } - - Scanner TheScanner; -} - -extern "C" -void scan_email_lg(struct scanner_params &sp) { - scan_lg(email::TheScanner, sp, rcb); -} - -#endif // HAVE_LIBLIGHTGREP diff --git a/src/scan_gps_lg.cpp b/src/scan_gps_lg.cpp deleted file mode 100644 index 3bf96af2..00000000 --- a/src/scan_gps_lg.cpp +++ /dev/null @@ -1,212 +0,0 @@ -#include "config.h" - -// if liblightgrep isn't present, compiles to nothing -#ifdef HAVE_LIBLIGHTGREP - -#include - -#include "be20_api/scanner_params.h" - -#include "pattern_scanner.h" - -namespace gps { - const char* const DefaultEncodingsCStrings[] = {"UTF-8", "UTF-16LE"}; - - const vector DefaultEncodings( - DefaultEncodingsCStrings, - DefaultEncodingsCStrings + - sizeof(DefaultEncodingsCStrings)/sizeof(DefaultEncodingsCStrings[0]) - ); - - const LG_KeyOptions DefaultOptions = { 0, 0 }; // patterns, case-sensitive - - // - // helper functions - // - - /** - * Return NNN in - */ - string get_quoted_attrib(string text, string attrib) { - const size_t pos = text.find(attrib); - if (pos == string::npos) return ""; /* no attrib */ - const ssize_t quote1 = text.find('"', pos); - if (quote1 < 0) return ""; /* no opening quote */ - const ssize_t quote2 = text.find('"', quote1+1); - if (quote2 < 0) return ""; /* no closing quote */ - return text.substr(quote1+1, quote2-(quote1+1)); - } - - /** - * Return NNN in NNN - */ - string get_cdata(string text) { - const ssize_t gt = text.find('>'); - if (gt < 0) return ""; /* no > */ - const ssize_t lt = text.find('<', gt+1); - if (lt < 0) return ""; /* no < */ - return text.substr(gt+1, lt-(gt+1)); - } - - // - // subpatterns - // - - const string LATLON("(-?[0-9]{1,3}\\.[0-9]{6,8})"); - const string ELEV("(-?[0-9]{1,6}\\.[0-9]{0,3})"); - - // - // the scanner - // - - class Scanner: public PatternScanner { - public: - Scanner(): PatternScanner("gps_lg"), Recorder(0), Lat(), Lon(), Ele(), Time(), Speed(), Course() {} - virtual ~Scanner() {} - - virtual Scanner* clone() const { return new Scanner(*this); } - - virtual void startup(const scanner_params& sp); - virtual void init(const scanner_params& sp); - virtual void initScan(const scanner_params&); - - feature_recorder* Recorder {}; - - void trkptHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void eleHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void timeHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void speedHandler(const LG_SearchHit& hit, const scanner_params& sp); - - void courseHandler(const LG_SearchHit& hit, const scanner_params& sp); - - private: - Scanner(const Scanner& s): PatternScanner(s), Recorder(s.Recorder), Lat(s.Lat), Lon(s.Lon), Ele(s.Ele), Time(s.Time), Speed(s.Speed), Course(s.Course) {} - Scanner& operator=(const Scanner&); - - void clear(const scanner_params& sp, size_t pos); - - string Lat, Lon, Ele, Time, Speed, Course; - }; - - void Scanner::startup(const scanner_params& sp) { - sp.check_version(); - - sp.info->name = "gps_lg"; - sp.info->author = "Simson L. Garfinkel"; - sp.info->description = "Garmin Trackpt XML info"; - sp.info->scanner_version = "1.0"; - sp.info->feature_defs.push_back( feature_recorder_def("gps")); - } - - void Scanner::init(const scanner_params& sp) { - // - // patterns - // - - const string TRKPT("" + ELEV + ""); - - new Handler( - *this, - ELE, - DefaultEncodings, - DefaultOptions, - &Scanner::eleHandler - ); - - const string TIME(""); - - new Handler( - *this, - TIME, - DefaultEncodings, - DefaultOptions, - &Scanner::timeHandler - ); - - const string GPXTPX_SPEED("" + ELEV + ""); - - new Handler( - *this, - GPXTPX_SPEED, - DefaultEncodings, - DefaultOptions, - &Scanner::speedHandler - ); - - const string GPXTPX_COURSE("" + ELEV + ""); - - new Handler( - *this, - GPXTPX_COURSE, - DefaultEncodings, - DefaultOptions, - &Scanner::courseHandler - ); - } - - void Scanner::initScan(const scanner_params& sp) { - Recorder = &sp.named_feature_recorder("gps"); - } - - void Scanner::clear(const scanner_params& sp, size_t pos) { - // dump the current and go to the next - if (!Time.empty() || !Lat.empty() || !Lon.empty() || - !Ele.empty() || !Speed.empty() || !Course.empty()) { - const string what = Time + "," + Lat + "," + Lon + "," + - Ele + "," + Speed + "," + Course; - // NB: the pos is the *end* of the "hit" - Recorder->write(sp.sbuf.pos0 + pos, what, ""); - - Time.clear(); - Lat.clear(); - Lon.clear(); - Ele.clear(); - Speed.clear(); - Course.clear(); - } - } - - void Scanner::trkptHandler(const LG_SearchHit& hit, const scanner_params& sp) { - clear(sp, hit.Start); - Lat = get_quoted_attrib(reinterpret_cast(sp.sbuf.buf), "lat"); - Lon = get_quoted_attrib(reinterpret_cast(sp.sbuf.buf), "lon"); - } - - void Scanner::eleHandler(const LG_SearchHit& hit, const scanner_params& sp) { - Ele = get_cdata(reinterpret_cast(sp.sbuf.buf)); - } - - void Scanner::timeHandler(const LG_SearchHit& hit, const scanner_params& sp) { - Time = get_cdata(reinterpret_cast(sp.sbuf.buf)); - } - - void Scanner::speedHandler(const LG_SearchHit& hit, const scanner_params& sp) { - Speed = get_cdata(reinterpret_cast(sp.sbuf.buf)); - } - - void Scanner::courseHandler(const LG_SearchHit& hit, const scanner_params& sp) { - Course = get_cdata(reinterpret_cast(sp.sbuf.buf)); - } - - Scanner TheScanner; -} - -extern "C" -void scan_gps_lg(scanner_params &sp) { - scan_lg(gps::TheScanner, sp, rcb); -} - -#endif // HAVE_LIBLIGHTGREP diff --git a/src/scan_httpheader_lg.cpp b/src/scan_httpheader_lg.cpp deleted file mode 100644 index 4b0d37fc..00000000 --- a/src/scan_httpheader_lg.cpp +++ /dev/null @@ -1,93 +0,0 @@ -#include - -namespace httpheader { - // - // subpatterns - // - - const std::string PC("[\\x20-\\x7E]"); - - const std::string XPC("[\\x20-\\x7E--\"]"); - - /* - * RFC 2616, Page 12 - */ - /* Account for over-zealously translated line breaks */ - /* HTTP_LWS - Linear White Space (new line and a whitespace character) */ - const std::string HTTP_CRLF("(\\x0D?\\x0A)"); - const std::string HTTP_LWS(HTTP_CRLF + "[ \\t]"); - - /* - * Keeping it simple: no HTTP_CTEXT, HTTP_QUOTED_PAIR, or keeping count - * of parentheses. The distinguishing part of COMMENTs is they are - * allowed to have line breaks, if followed by whitespace. - * - * TODO Might still need to account for RFC 2407. - */ - const std::string HTTP_COMMENT("(" + PC + "|" + HTTP_LWS + "|\\t)"); - - // - // patterns - // - - /* - * RFC 2616, Sections 14.38 and 14.43 - * These fields are allowed multi-line values (comments). - * - * For some reason, specifying the field value as: - * ({XPC}|{HTTP_LWS})+ - * causes the NFA rule set to explode to >32000 rules, making flex refuse - * to compile. - */ - const std::string SERVER_OR_UA("(Server|User-Agent):[ \\t]?" + PC + "{1,80}"); - - /* - * RFC 2616, Section 14.23 - */ - const std::string HOST("Host:[ \\t]?[a-zA-Z0-9._:]{1,256}"); - - /* - * These headers have a general set of characters allowed in their field - * value, including double-quote. - * - * Keep-Alive is defined in RFC 2068, Section 19.7.1.1. Allowable tokens - * seem to include doublequote, per "value" definition in RFC 2068, - * Section 3.7. - * - * Authorization, Proxy-Authenticate, Proxy-Authorization and WWW- - * Authenticate are defined in RFC 2617, not yet reviewed. Assuming PC - * character set allowed for now. - * - * Content-Location, Location and Referer (RFC 2616, Sections 14.14, 14.30 - * and 14.36) have a URI as the field value. - * SLG: Limited to 80 characters - */ - const std::string HEADERS_1("(Accept|Accept-Ranges|Authorization|Cache-Control|Content-Location|Etag|Expect|Keep-Alive|If-Match|If-None-Match|If-Range|Pragma|Proxy-Authenticate|Proxy-Authorization|Referer|TE|Transfer-Encoding|Warning|WWW-Authenticate):[ \\t]?" + PC + "{1,80}"); - - /* - * These headers have a general set of characters allowed in their field - * value, excluding double-quote. - * - * Date and If-Modified-Since reference RFCs 1123 and 850 (RFC 2616 - * Section 3.3.1), not yet reviewed. - * Double-quotes are assumed excluded. - * - * Set-Cookie: RFC 6265, Section 4.1.1, Page 9 - * This header field is allowed to be sent multiple times in the same - * header. - * - * Cookie: RFC 6265, Section 4.2.1, Page 13 - * The cookie length does not seem to have a limit, but cookie stores should - * be able to store at least 4096 bytes for a cookie [RFC 6265, Section 6.1]. - * - * From: should contain an email address. - */ - const std::string HEADERS_2("(Accept-Charset|Accept-Encoding|Accept-Language|Age|Allow|Connection|Content-Encoding|Content-Language|Content-MD5|Content-Range|Content-Type|Cookie|Date|From|If-Modified-Since|If-Unmodified-Since|Last-Modified|Range|Retry-After|Set-Cookie|Trailer|Upgrade|Vary):[ \\t]?" + XPC + "{1,80}"); - - const std::string VIA("Via:[ \\t]?" + HTTP_COMMENT + "{1,256}"); - - /* - * RFC 2616, Sections 14.13 and 14.31 - */ - const std::string HEADERS_3("(Content-Length|Max-Forwards):[ \\t]?[0-9]{1,12}"); -} diff --git a/src/scan_lightgrep.cpp b/src/scan_lightgrep.cpp index a9615222..38f3b065 100644 --- a/src/scan_lightgrep.cpp +++ b/src/scan_lightgrep.cpp @@ -9,9 +9,8 @@ #include #include "be20_api/scanner_params.h" - -//#include "be20_api/beregex.h" -#include "histogram.h" +#include "be20_api/scanner_set.h" +#include "be20_api/histogram_def.h" #include "pattern_scanner.h" #include @@ -29,59 +28,54 @@ namespace { // local namespace hides these from other translation units }; virtual void startup(const scanner_params& sp) { - sp.info.set_name("scan_lightgrep"); + sp.info->set_name("lightgrep"); sp.info->author = "Jon Stewart"; sp.info->description = "Advanced search for patterns"; - sp.info->scanner_version = "0.2"; - sp.info->flags = scanner_info::SCANNER_FIND_SCANNER | scanner_info::SCANNER_FAST_FIND; - sp.info->feature_names.insert(name()); - sp.info->histogram_defs.insert(histogram_def( name(), "", "histogram", HistogramMaker::FLAG_LOWERCASE)); + sp.info->scanner_version = "2.0"; + sp.info->feature_defs.push_back(feature_recorder_def("lightgrep")); + sp.info->scanner_flags.find_scanner = true; + auto lowercase = histogram_def::flags_t(); + lowercase.lowercase = true; + sp.info->histogram_defs.push_back(histogram_def(name(), name(), "", "", "histogram", lowercase)); } virtual void init(const scanner_params& sp) { } - virtual void initScan(const scanner_params& sp) { - LgRec = &sp.named_feature_recorder(name()); - } - feature_recorder* LgRec; - void processHit(const LG_SearchHit& hit, const scanner_params& sp) { - LgRec->write_buf(sp.sbuf, hit.Start, hit.End - hit.Start); - } - private: FindScanner(const FindScanner& x): PatternScanner(x), LgRec(x.LgRec) {} FindScanner& operator=(const FindScanner&); }; - - FindScanner Scanner; - - CallbackFnType ProcessHit; } extern "C" void scan_lightgrep(struct scanner_params &sp) { + static std::unique_ptr lg_findscanner_ptr; + static std::unique_ptr lg_ptr; switch (sp.phase) { case scanner_params::PHASE_INIT: - Scanner.startup(sp); - ProcessHit = static_cast(&FindScanner::processHit); + lg_findscanner_ptr.reset(new FindScanner); + lg_findscanner_ptr->startup(sp); break; - case scanner_params::PHASE_INIT: + case scanner_params::PHASE_INIT2: { - Scanner.init(sp); - LightgrepController& lg(LightgrepController::Get()); - lg.addUserPatterns(Scanner, &ProcessHit, sp.ss->sc); // note: FindOpts now passed in ScannerConfig - lg.regcomp(); - break; + lg_findscanner_ptr->init(sp); + lg_ptr.reset(new LightgrepController); + if (!lg_ptr->addUserPatterns(*lg_findscanner_ptr, sp.ss->find_patterns(), sp.ss->find_files())) { + throw std::runtime_error("There was an error parsing the lightgrep scanner's patterns."); + } + lg_ptr->regcomp(); } + break; case scanner_params::PHASE_SCAN: - LightgrepController::Get().scan(sp); + lg_ptr->scan(sp); break; case scanner_params::PHASE_SHUTDOWN: - Scanner.shutdown(sp); + lg_findscanner_ptr.reset(); + lg_ptr.reset(); break; default: break;