diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..220a57b --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +!/ + +/.* +/target + +!/.gitignore diff --git a/README.md b/README.md index 957eaae..bd88a08 100644 --- a/README.md +++ b/README.md @@ -38,10 +38,17 @@ Let's be honest, this problem isn't *that* hard, but it should be fun! All you n Also, as an incentive to score well, we'll tweet the final leaderboard when this ends :). #### Here's what you have to do to get the participation award: -- Submit a valid entry (see above) +- Submit a valid entry (see above) - Star this repo - Follow Tinybird on Twitter [@tinybirdco](https://twitter.com/tinybirdco) and/or LinkedIn - Share your submission on Twitter/LinkedIn using #stalbnashackathon (tag us too!) ## Need help? Join our [Community Slack](https://www.tinybird.co/join-our-slack-community)! + +## The Solution +It's a very simple project using Lucene to compare the texts against the canonical "St. Albans" ;) 100% effective without reinventing the wheel XD + +![image](/img/the-code.png) + +![image](/img/the-results.png) diff --git a/img/the-code.png b/img/the-code.png new file mode 100644 index 0000000..36d9cde Binary files /dev/null and b/img/the-code.png differ diff --git a/img/the-results.png b/img/the-results.png new file mode 100644 index 0000000..73c7e38 Binary files /dev/null and b/img/the-results.png differ diff --git a/negatives.txt b/negatives.txt index afa2bf2..737fbaf 100644 --- a/negatives.txt +++ b/negatives.txt @@ -1,6 +1,6 @@ ['St. Paul', 'Albans', - 'Albna' + 'Albna', 'St. Alberts', 'Alberta', 'St. 
Johnsbury, VT', diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..97c62e3 --- /dev/null +++ b/pom.xml @@ -0,0 +1,44 @@ + + + 4.0.0 + es.carmelonhaldon + st-albnas-hackathon + 0.0.1-SNAPSHOT + + + + 1.8 + 1.8 + + 2.13.3 + 7.4.0 + + + + + + + com.fasterxml.jackson.core + jackson-core + ${jackson.version} + + + com.fasterxml.jackson.dataformat + jackson-dataformat-xml + ${jackson.version} + + + + org.apache.lucene + lucene-core + ${lucene.version} + + + org.apache.lucene + lucene-analyzers-common + ${lucene.version} + + + + + diff --git a/src/main/java/es/carmelonhaldon/stalbnashackathon/Main.java b/src/main/java/es/carmelonhaldon/stalbnashackathon/Main.java new file mode 100644 index 0000000..5eb689d --- /dev/null +++ b/src/main/java/es/carmelonhaldon/stalbnashackathon/Main.java @@ -0,0 +1,95 @@ +package es.carmelonhaldon.stalbnashackathon; + +import java.io.IOException; +import java.net.URL; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.KeywordAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; + +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.exc.StreamReadException; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.DatabindException; +import com.fasterxml.jackson.databind.ObjectMapper; + +public class Main { + + public static void 
main(String[] args) throws StreamReadException, DatabindException, IOException { + + // carmelonhaldon: Compare "St. Albans"... + String stAlbans = "St. Albans"; + IndexSearcher stAlbansSearcher = indexKeyword(stAlbans); + + // carmelonhaldon: ... against positives... + List positives = readJson("file:positives.txt"); + System.out.printf("%npositives -> %d%n%n", positives.size()); + + for (String positive : positives) { + + Query positiveQuery = new FuzzyQuery(new Term("text", positive)); + TopDocs searchResults = stAlbansSearcher.search(positiveQuery, 1); + + // carmelonhaldon: If found, can be replaced by "St. Albans"... + System.out.printf("%s -> %s%n", positive, searchResults.totalHits > 0 ? stAlbans : "KO"); + } + + // carmelonhaldon: ... and negatives. + List negatives = readJson("file:negatives.txt"); + System.out.printf("%nnegatives -> %d%n%n", negatives.size()); + + for (String negative : negatives) { + + Query negativeQuery = new FuzzyQuery(new Term("text", negative)); + TopDocs searchResults = stAlbansSearcher.search(negativeQuery, 1); + + // carmelonhaldon: ... If not, "KO". + System.out.printf("%s -> %s%n", negative, searchResults.totalHits > 0 ? stAlbans : "KO"); + } + } + + // carmelonhaldon: https://riptutorial.com/lucene/example/20860/hello-world + // carmelonhaldon: As a "field" to classify, it's better to use the KeywordAnalyzer rathern than the StandardAnalyzer. 
+ private static IndexSearcher indexKeyword(String keyword) throws IOException { + + Directory directory = new RAMDirectory(); + Analyzer analyzer = new KeywordAnalyzer(); + + IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer); + IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig); + + Document document = new Document(); + document.add(new TextField("text", keyword, Field.Store.YES)); + + indexWriter.addDocument(document); + indexWriter.close(); + + IndexReader indexReader = DirectoryReader.open(directory); + return new IndexSearcher(indexReader); + } + + private static List readJson(String json) throws StreamReadException, DatabindException, IOException { + + URL url = new URL(json); + + ObjectMapper objectMapper = new ObjectMapper(); + objectMapper.configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, true); + + return objectMapper.readValue(url, new TypeReference>() { + }); + } +}