Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
!/

/.*
/target

!/.gitignore
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,17 @@ Let's be honest, this problem isn't *that* hard, but it should be fun! All you n
Also, as an incentive to score well, we'll tweet the final leaderboard when this ends :).

#### Here's what you have to do to get the participation award:
- Submit a valid entry (see above)
- Submit a valid entry (see above)
- Star this repo
- Follow Tinybird on Twitter [@tinybirdco](https://twitter.com/tinybirdco) and/or LinkedIn
- Share your submission on Twitter/LinkedIn using #stalbnashackathon (tag us too!)

## Need help?
Join our [Community Slack](https://www.tinybird.co/join-our-slack-community)!

## The Solution
It's a very simple project that uses Lucene to fuzzy-match each text against the canonical "St. Albans" ;) 100% effective without reinventing the wheel XD

![image](/img/the-code.png)

![image](/img/the-results.png)
Binary file added img/the-code.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/the-results.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion negatives.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
['St. Paul',
'Albans',
'Albna'
'Albna',
'St. Alberts',
'Alberta',
'St. Johnsbury, VT',
Expand Down
44 changes: 44 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">

    <modelVersion>4.0.0</modelVersion>
    <groupId>es.carmelonhaldon</groupId>
    <artifactId>st-albnas-hackathon</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <properties>

        <!-- Pin the source encoding so the build does not depend on the platform default charset. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>

        <maven.compiler.target>1.8</maven.compiler.target>
        <maven.compiler.source>1.8</maven.compiler.source>

        <jackson.version>2.13.3</jackson.version>
        <lucene.version>7.4.0</lucene.version>

    </properties>

    <dependencies>

        <!-- JSON parsing of positives.txt / negatives.txt. -->
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <!-- Main.java imports com.fasterxml.jackson.databind.* directly, so declare the artifact
             explicitly instead of relying on it arriving transitively. -->
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.dataformat</groupId>
            <artifactId>jackson-dataformat-xml</artifactId>
            <version>${jackson.version}</version>
        </dependency>

        <!-- In-memory index and fuzzy search used to compare texts against "St. Albans". -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>${lucene.version}</version>
        </dependency>

    </dependencies>

</project>
95 changes: 95 additions & 0 deletions src/main/java/es/carmelonhaldon/stalbnashackathon/Main.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package es.carmelonhaldon.stalbnashackathon;

import java.io.IOException;
import java.net.URL;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.exc.StreamReadException;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.DatabindException;
import com.fasterxml.jackson.databind.ObjectMapper;

/**
 * Command-line entry point that reports which spellings listed in {@code positives.txt} and
 * {@code negatives.txt} fuzzily match the canonical text "St. Albans".
 *
 * <p>The canonical text is indexed as a single Lucene document, and every candidate spelling is
 * run against that index as a {@link FuzzyQuery}. A hit means the candidate can be replaced by
 * the canonical text; otherwise "KO" is printed.
 */
public class Main {

    /** Name of the only indexed field; the fuzzy queries must target the same field. */
    private static final String FIELD = "text";

    /** Shared JSON reader; single quotes are allowed because the input lists are single-quoted. */
    private static final ObjectMapper OBJECT_MAPPER =
            new ObjectMapper().configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, true);

    /**
     * Indexes "St. Albans", then prints the verdict for every positive and every negative sample.
     *
     * @param args unused
     * @throws StreamReadException if an input file is not well-formed JSON
     * @throws DatabindException if an input file is not a JSON array of strings
     * @throws IOException if an input file or the index cannot be read
     */
    public static void main(String[] args) throws StreamReadException, DatabindException, IOException {

        // carmelonhaldon: Compare "St. Albans"...
        String stAlbans = "St. Albans";
        IndexSearcher stAlbansSearcher = indexKeyword(stAlbans);

        try {
            // carmelonhaldon: ... against positives...
            classify("positives", readJson("file:positives.txt"), stAlbansSearcher, stAlbans);

            // carmelonhaldon: ... and negatives.
            classify("negatives", readJson("file:negatives.txt"), stAlbansSearcher, stAlbans);
        } finally {
            // IndexSearcher has no close(); the reader opened in indexKeyword is released here.
            stAlbansSearcher.getIndexReader().close();
        }
    }

    /**
     * Prints a header with the number of candidates, then one line per candidate.
     *
     * @param label name of the sample set, printed in the header
     * @param candidates spellings to check
     * @param searcher searcher over the index that holds the canonical text
     * @param canonical text printed when a candidate matches
     * @throws IOException if the index cannot be searched
     */
    private static void classify(String label, List<String> candidates, IndexSearcher searcher, String canonical)
            throws IOException {

        System.out.printf("%n%s -> %d%n%n", label, candidates.size());

        for (String candidate : candidates) {

            Query query = new FuzzyQuery(new Term(FIELD, candidate));
            TopDocs searchResults = searcher.search(query, 1);

            // carmelonhaldon: If found, can be replaced by "St. Albans"... If not, "KO".
            System.out.printf("%s -> %s%n", candidate, searchResults.totalHits > 0 ? canonical : "KO");
        }
    }

    // carmelonhaldon: https://riptutorial.com/lucene/example/20860/hello-world
    // carmelonhaldon: As a "field" to classify, it's better to use the KeywordAnalyzer rather than the StandardAnalyzer.
    /**
     * Builds an in-memory index containing a single document whose {@code text} field is the
     * given keyword, and returns a searcher over it.
     *
     * <p>The caller is responsible for closing the searcher's underlying reader.
     *
     * @param keyword text to index
     * @return searcher over the one-document index
     * @throws IOException if the index cannot be written or opened
     */
    private static IndexSearcher indexKeyword(String keyword) throws IOException {

        Directory directory = new RAMDirectory();

        // Resources close in reverse order: the writer is closed before the analyzer it uses,
        // even when addDocument throws.
        try (Analyzer analyzer = new KeywordAnalyzer();
                IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer))) {

            Document document = new Document();
            document.add(new TextField(FIELD, keyword, Field.Store.YES));

            indexWriter.addDocument(document);
        }

        IndexReader indexReader = DirectoryReader.open(directory);
        return new IndexSearcher(indexReader);
    }

    /**
     * Reads a JSON array of strings from the given location.
     *
     * @param json URL of the document to read (for example {@code file:positives.txt}), not the
     *     JSON text itself
     * @return the strings in the array, in document order
     * @throws StreamReadException if the content is not well-formed JSON
     * @throws DatabindException if the content is not an array of strings
     * @throws IOException if the URL is malformed or cannot be read
     */
    private static List<String> readJson(String json) throws StreamReadException, DatabindException, IOException {

        URL url = new URL(json);

        return OBJECT_MAPPER.readValue(url, new TypeReference<List<String>>() {
        });
    }
}