Skip to content

Druid index_parallel ingestion failing in merge phase #19035

@Shreyansh1004

Description

@Shreyansh1004

I am trying to ingest data into a Druid datasource using index_parallel, but the ingestion keeps failing in the merge phase.

Here is the task spec:

{
"type": "index_parallel",
"spec": {
"dataSchema": {
"dataSource": "X",
"timestampSpec": {
"column": "timestamp",
"format": "auto"
},
"dimensionsSpec": {
"dimensions": [
"segment",
"region",
"deviceIdType",
"scope"
]
},
"metricsSpec": [
{
"fieldName": "X",
"type": "thetaSketch",
"name": "X",
"isInputThetaSketch": true,
"size": 65536
}
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "day",
"queryGranularity": "day",
"intervals": ["PLACEHOLDER"],
"rollup": false
}
},
"ioConfig": {
"type": "index_parallel",
"inputSource": {
"type": "combining",
"delegates": [
{
"type": "druid",
"dataSource": "X",
"interval": "PLACEHOLDER",
"filter": {
"type": "not",
"field": {
"type": "in",
"dimension": "X",
"values": ["X"]
}
}
}
]
},
"inputFormat": {
"type": "parquet",
"binaryAsString": false
},
"appendToExisting": false
},
"tuningConfig": {
"type": "index_parallel",
"forceGuaranteedRollup": true,
"maxRowsInMemory": 25000,
"maxBytesInMemory": 500000000,
"partitionsSpec": {
"type": "hashed",
"targetRowsPerSegment": 6000
},
"maxNumConcurrentSubTasks": 10,
"maxRetry": 3,
"taskStatusCheckPeriodMs": 1000,
"chatHandlerTimeout": "PT10S",
"chatHandlerNumRetries": 5,
"pushTimeout": 0,
"ignoreInvalidRows": true,
"buildV9Directly": true
}
}
}

I am using a single MiddleManager on an r6gd.8xlarge instance with 12 worker slots, with a 4 GB heap for the MiddleManager and 20 GB (16 GB heap + 4 GB direct memory) per peon task.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions