Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Build outputs (regenerated inside the builder stage)
build/
.gradle/
out/

# IDE / editor / OS
.idea/
.vscode/
*.iml
.DS_Store

# Runtime & logs (never needed at build time)
logs/
tmp/
*.log

# Research / developer helpers never copied into the image
script/
doc/

# Markdown & meta — not copied into the image
*.md
LICENSE
.claude/
.github/

# Note: .git/ is intentionally NOT listed — Dockerfile.datastet COPYs it so
# Gradle can stamp revision.txt from `git describe --tags --always --first-parent`.
2 changes: 1 addition & 1 deletion .github/workflows/ci-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ on: [push]

concurrency:
group: gradle
# cancel-in-progress: true
cancel-in-progress: true


jobs:
Expand Down
9 changes: 9 additions & 0 deletions Dockerfile.datastet
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,15 @@ ENV DATASTET_OPTS="-Djava.library.path=/opt/grobid/grobid-home/lib/lin-64:/usr/l

CMD ["./datastet/bin/datastet", "server", "datastet/resources/config/config.yml"]

# Container-level health probe backed by the model-aware /service/health
# endpoint, which returns HTTP 503 until every classifier reports "loaded".
# The 180s start-period covers the worst case with modelPreload=true, where
# SciBERT + four context classifiers + DataType classifier all load before
# the first probe. Using python3 (required by JEP/DeLFT so guaranteed
# present) to avoid a dependency on curl/wget from the base image.
HEALTHCHECK --interval=30s --timeout=5s --start-period=180s --retries=3 \
CMD python3 -c "import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:8060/service/health', timeout=4).status == 200 else 1)" || exit 1

LABEL \
authors="The contributors" \
org.label-schema.name="datastet" \
Expand Down
32 changes: 32 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,29 @@ apply plugin: 'java-library'
apply plugin: 'base'
apply plugin: 'com.github.kt3k.coveralls'

/**
 * Resolves a revision label for the current checkout by running
 * `git describe --tags --always --first-parent` in the root project directory.
 *
 * @return the trimmed standard output of the git command, or "unknown" when
 *         running the command or reading its output throws.
 */
def getGitRevision() {
    try {
        def describe = providers.exec {
            workingDir = rootProject.rootDir
            commandLine 'git', 'describe', '--tags', '--always', '--first-parent'
        }
        // The provider is resolved here; any failure surfaces as an exception at this point.
        return describe.standardOutput.asText.get().trim()
    } catch (Exception e) {
        // Best effort: the build continues with a placeholder revision.
        println "Could not get Git revision: ${e}"
        return "unknown"
    }
}

project.ext.gitRevision = getGitRevision()

// Diagnostic task: prints the Git revision that the script-level
// `project.ext.gitRevision = getGitRevision()` assignment resolved at configuration time.
tasks.register('collectGitRevision') {
    // Capture the already-computed value while the task is being configured.
    // Calling getGitRevision() again here would only repeat the same `git describe`
    // invocation, and reading `project` from inside doLast is not supported when
    // Gradle's configuration cache is enabled.
    def revision = project.ext.gitRevision
    doLast {
        println "Git revision: ${revision}"
    }
}

group = "org.grobid.datastet"
version = '0.9.0'

Expand Down Expand Up @@ -281,6 +304,15 @@ artifacts {
archives shadowJar
}

// Expands the `project_version` and `project_revision` template tokens in
// version.txt and revision.txt while resources are copied.
processResources {
    def tokens = [
            project_version : project.property('version') ?: "unknown",
            project_revision: rootProject.ext.gitRevision
    ]
    // Register the substituted values as task inputs. Without this, a new commit or a
    // version bump that leaves the resource files themselves untouched would keep the
    // task UP-TO-DATE and ship stale version.txt / revision.txt contents.
    inputs.properties(tokens)
    filesMatching(["version.txt", "revision.txt"]) {
        expand(tokens)
    }
}

task copyModels(type: Copy) {
from "${rootDir}/resources/models"
include "**/*.wapiti"
Expand Down
61 changes: 15 additions & 46 deletions src/main/java/org/grobid/core/engines/DatasetDisambiguator.java
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,6 @@ public boolean checkIfAlive() {
public void ensureCustomizationReady() {
boolean result = false;
URL url = null;
CloseableHttpResponse response = null;
try {
if ((nerd_port != null) && (nerd_port.length() > 0))
if (nerd_port.equals("443"))
Expand All @@ -157,24 +156,15 @@ public void ensureCustomizationReady() {
url = new URL("http://" + nerd_host + "/service/customisation/dataset");

LOGGER.debug("Calling: " + url.toString());
//System.out.println("Calling: " + url.toString());
CloseableHttpClient httpClient = HttpClients.createDefault();
HttpGet get = new HttpGet(url.toString());
Scanner in = null;
try {
response = httpClient.execute(get);
//System.out.println(response.getStatusLine());
try (CloseableHttpClient httpClient = HttpClients.createDefault();
CloseableHttpResponse response = httpClient.execute(get)) {
int code = response.getStatusLine().getStatusCode();
if (code != 200) {
LOGGER.info("Failed customization lookup service: HTTP error code : " + code + " - the customization will be loaded");
} else {
result = true;
}
} finally {
if (in != null)
in.close();
if (response != null)
response.close();
}
} catch (MalformedURLException e) {
LOGGER.warn("entity-fishing URL is malformed, customization skipped");
Expand All @@ -196,44 +186,34 @@ public void ensureCustomizationReady() {
url = new URL("http://" + nerd_host + "/service/customisations");

LOGGER.debug("Calling: " + url.toString());
//System.out.println("Calling: " + url.toString());
// load the dataset customisation
File cutomisationFile = new File("resources/config/customisation-dataset.json");
cutomisationFile = new File(cutomisationFile.getAbsolutePath());

String json = FileUtils.readFileToString(cutomisationFile, "UTF-8");

CloseableHttpClient httpClient = HttpClients.createDefault();
HttpPost post = new HttpPost(url.toString());

//StringBody stringValue = new StringBody(json, ContentType.MULTIPART_FORM_DATA);
//StringBody stringName = new StringBody("dataset", ContentType.MULTIPART_FORM_DATA);
MultipartEntityBuilder builder = MultipartEntityBuilder.create();
builder.setMode(HttpMultipartMode.BROWSER_COMPATIBLE);
builder.addTextBody("value", json);
builder.addTextBody("name", "dataset");
//builder.addPart("value", stringValue);
//builder.addPart("name", stringName);
HttpEntity entity = builder.build();
try {
post.setEntity(entity);
response = httpClient.execute(post);
//System.out.println(response.getStatusLine());
post.setEntity(entity);

try (CloseableHttpClient httpClient = HttpClients.createDefault();
CloseableHttpResponse response = httpClient.execute(post)) {
int code = response.getStatusLine().getStatusCode();
if (code != 200) {
LOGGER.error("Failed loading dataset customisation: HTTP error code : " + code);
} else {
LOGGER.info("Dataset customisation loaded");
}
} finally {
if (response != null)
response.close();
}
} catch (MalformedURLException e) {
e.printStackTrace();
LOGGER.warn("MalformedURLException while loading dataset customisation", e);
} catch (IOException e) {
e.printStackTrace();
LOGGER.warn("I/O error while loading dataset customisation", e);
}
}
}
Expand Down Expand Up @@ -489,8 +469,6 @@ public String runNerd(List<Dataset> entities, List<LayoutToken> subtokens, Strin
url = new URL("http://" + nerd_host + ":" + nerd_port + "/service/" + RESOURCEPATH);
else
url = new URL("http://" + nerd_host + "/service/" + RESOURCEPATH);
//System.out.println("calling... " + url.toString());
CloseableHttpClient httpClient = HttpClients.createDefault();
HttpPost post = new HttpPost(url.toString());
//post.addHeader("Content-Type", "application/json");
//post.addHeader("Accept", "application/json");
Expand Down Expand Up @@ -567,32 +545,23 @@ public String runNerd(List<Dataset> entities, List<LayoutToken> subtokens, Strin
builder.addPart("query", stringBody);
HttpEntity entity = builder.build();

CloseableHttpResponse response = null;
Scanner in = null;
try {
//post.setEntity(new UrlEncodedFormEntity(params));
post.setEntity(entity);
response = httpClient.execute(post);
// System.out.println(response.getStatusLine());

post.setEntity(entity);
try (CloseableHttpClient httpClient = HttpClients.createDefault();
CloseableHttpResponse response = httpClient.execute(post)) {
int code = response.getStatusLine().getStatusCode();
if (code != 200) {
LOGGER.warn("entity-fishing annotation returned HTTP " + code + ", disambiguation skipped");
return null;
}

HttpEntity entityResp = response.getEntity();
in = new Scanner(entityResp.getContent());
while (in.hasNext()) {
output.append(in.next());
output.append(" ");
try (Scanner in = new Scanner(entityResp.getContent())) {
while (in.hasNext()) {
output.append(in.next());
output.append(" ");
}
}
EntityUtils.consume(entityResp);
} finally {
if (in != null)
in.close();
if (response != null)
response.close();
}
} catch (MalformedURLException e) {
LOGGER.warn("entity-fishing URL is malformed, disambiguation skipped");
Expand Down
Loading
Loading