The sample code to load a VCF file with Hail is failing for me when I try it on a single 6 Gb VCF from the whole genome pVCFs. Can you help me to resolve this error?
Here is my code, and its output.
I ran this by starting an analysis with a JupyterLab Spark Cluster with the default settings (mem1_hdd1_v2_x16 with 2 workers).
from pyspark.sql import SparkSession
import hail as hl
builder = (
SparkSession
.builder
.enableHiveSupport()
)
spark = builder.getOrCreate()
hl.init(sc=spark.sparkContext)
file_url = 'file:///mnt/project/Bulk/Whole genome sequences/Whole genome GraphTyper joint call pVCF/*c10_b269_v1.vcf.gz'
# Matches 1 file of 6.5 Gb
# Import genomic data into a MatrixTable
mt = hl.import_vcf(file_url,
force_bgz=True,
reference_genome="GRCh38",
array_elements_required=False)
print(f"Num partitions: {mt.n_partitions()}")
mt.describe()
Output:
2022-09-30 14:25:30 Hail: INFO: Coerced prefix-sorted dataset
---------------------------------------------------------------------------
FatalError Traceback (most recent call last)
<ipython-input-3-70af5881c720> in <module>
----> 1 print(f"Num partitions: {mt.n_partitions()}")
2 mt.describe()
/opt/conda/lib/python3.6/site-packages/hail/matrixtable.py in n_partitions(self)
3275 Number of partitions.
3276 """
-> 3277 return Env.backend().execute(ir.MatrixToValueApply(self._mir, {'name': 'NPartitionsMatrixTable'}))
3278
3279 @typecheck_method(n_partitions=int,
/opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
108 raise HailUserError(message_and_trace) from None
109
--> 110 raise e
/opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
84 # print(self._hail_package.expr.ir.Pretty.apply(jir, True, -1))
85 try:
---> 86 result_tuple = self._jhc.backend().executeEncode(jir, stream_codec)
87 (result, timings) = (result_tuple._1(), result_tuple._2())
88 value = ir.typ._from_encoding(result)
/cluster/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in deco(*args, **kwargs)
29 raise FatalError('%s\n\nJava stack trace:\n%s\n'
30 'Hail version: %s\n'
---> 31 'Error summary: %s' % (deepest, full, hail.__version__, deepest), error_id) from None
32 except pyspark.sql.utils.CapturedException as e:
33 raise FatalError('%s\n\nJava stack trace:\n%s\n'
FatalError: IllegalArgumentException: requirement failed
Java stack trace:
java.lang.IllegalArgumentException: requirement failed
at scala.Predef$.require(Predef.scala:212)
at is.hail.rvd.RVDPartitioner.<init>(RVDPartitioner.scala:52)
at is.hail.rvd.RVDPartitioner.extendKeySamePartitions(RVDPartitioner.scala:138)
at is.hail.expr.ir.LoweredTableReader$$anon$4.coerce(TableIR.scala:387)
at is.hail.expr.ir.GenericTableValue.toTableStage(GenericTableValue.scala:159)
at is.hail.io.vcf.MatrixVCFReader.lower(LoadVCF.scala:1790)
at is.hail.expr.ir.lowering.LowerTableIR$.is$hail$expr$ir$lowering$LowerTableIR$$lower$1(LowerTableIR.scala:403)
at is.hail.expr.ir.lowering.LowerTableIR$.apply(LowerTableIR.scala:1330)
at is.hail.expr.ir.lowering.LowerToCDA$.lower(LowerToCDA.scala:69)
at is.hail.expr.ir.lowering.LowerToCDA$.apply(LowerToCDA.scala:18)
at is.hail.expr.ir.lowering.LowerToDistributedArrayPass.transform(LoweringPass.scala:77)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.evaluate$1(LowerOrInterpretNonCompilable.scala:27)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.is$hail$expr$ir$LowerOrInterpretNonCompilable$$rewrite$1(LowerOrInterpretNonCompilable.scala:67)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.is$hail$expr$ir$LowerOrInterpretNonCompilable$$rewrite$1(LowerOrInterpretNonCompilable.scala:53)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.apply(LowerOrInterpretNonCompilable.scala:72)
at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.transform(LoweringPass.scala:69)
at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3$$anonfun$1.apply(LoweringPass.scala:16)
at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3$$anonfun$1.apply(LoweringPass.scala:16)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3.apply(LoweringPass.scala:16)
at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3.apply(LoweringPass.scala:14)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass$class.apply(LoweringPass.scala:14)
at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.apply(LoweringPass.scala:64)
at is.hail.expr.ir.lowering.LoweringPipeline$$anonfun$apply$1.apply(LoweringPipeline.scala:15)
at is.hail.expr.ir.lowering.LoweringPipeline$$anonfun$apply$1.apply(LoweringPipeline.scala:13)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:13)
at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:47)
at is.hail.backend.spark.SparkBackend.is$hail$backend$spark$SparkBackend$$_execute(SparkBackend.scala:381)
at is.hail.backend.spark.SparkBackend$$anonfun$8$$anonfun$apply$4.apply(SparkBackend.scala:417)
at is.hail.backend.spark.SparkBackend$$anonfun$8$$anonfun$apply$4.apply(SparkBackend.scala:414)
at is.hail.backend.ExecuteContext$$anonfun$scoped$1$$anonfun$apply$1.apply(ExecuteContext.scala:47)
at is.hail.backend.ExecuteContext$$anonfun$scoped$1$$anonfun$apply$1.apply(ExecuteContext.scala:47)
at is.hail.utils.package$.using(package.scala:638)
at is.hail.backend.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:47)
at is.hail.backend.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:46)
at is.hail.utils.package$.using(package.scala:638)
at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:46)
at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:275)
at is.hail.backend.spark.SparkBackend$$anonfun$8.apply(SparkBackend.scala:414)
at is.hail.backend.spark.SparkBackend$$anonfun$8.apply(SparkBackend.scala:413)
at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
at is.hail.backend.spark.SparkBackend.executeEncode(SparkBackend.scala:413)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Hail version: 0.2.78-b17627756568
Error summary: IllegalArgumentException: requirement failed
Comments
3 comments
I am observing the same behavior on my end for WGS pVCF. I have informed RAP engineering team.
Thank you very much!
I hope this can be resolved soon. It seems like no commands that actually operate on the data using Hail are working.
Hello - I'm just wondering if there has been any progress on this? It seems to be a major issue if the WGS data cannot be processed using Hail.
Please sign in to leave a comment.