The sample code to load a VCF file with Hail is failing for me when I try it on a single 6 GB VCF from the whole-genome pVCFs. Can you help me resolve this error?

Here is my code, and its output.

I ran this by starting an analysis with a JupyterLab Spark Cluster with the default settings (mem1_hdd1_v2_x16 with 2 workers).

 

 

from pyspark.sql import SparkSession

import hail as hl

builder = (

  SparkSession

  .builder

  .enableHiveSupport()

)

spark = builder.getOrCreate()

hl.init(sc=spark.sparkContext)

 

file_url = 'file:///mnt/project/Bulk/Whole genome sequences/Whole genome GraphTyper joint call pVCF/*c10_b269_v1.vcf.gz'

# Matches 1 file of 6.5 Gb

 

# Import genomic data into a MatrixTable

mt = hl.import_vcf(file_url, 

          force_bgz=True, 

          reference_genome="GRCh38", 

          array_elements_required=False)

          

print(f"Num partitions: {mt.n_partitions()}")

mt.describe()

 

 

Output:

 

2022-09-30 14:25:30 Hail: INFO: Coerced prefix-sorted dataset

---------------------------------------------------------------------------

FatalError                Traceback (most recent call last)

<ipython-input-3-70af5881c720> in <module>

----> 1 print(f"Num partitions: {mt.n_partitions()}")

   2 mt.describe()

 

/opt/conda/lib/python3.6/site-packages/hail/matrixtable.py in n_partitions(self)

  3275       Number of partitions.

  3276     """

-> 3277     return Env.backend().execute(ir.MatrixToValueApply(self._mir, {'name': 'NPartitionsMatrixTable'}))

  3278 

  3279   @typecheck_method(n_partitions=int,

 

/opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)

  108         raise HailUserError(message_and_trace) from None

  109 

--> 110       raise e

 

/opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)

   84     # print(self._hail_package.expr.ir.Pretty.apply(jir, True, -1))

   85     try:

---> 86       result_tuple = self._jhc.backend().executeEncode(jir, stream_codec)

   87       (result, timings) = (result_tuple._1(), result_tuple._2())

   88       value = ir.typ._from_encoding(result)

 

/cluster/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)

  1255     answer = self.gateway_client.send_command(command)

  1256     return_value = get_return_value(

-> 1257       answer, self.gateway_client, self.target_id, self.name)

  1258 

  1259     for temp_arg in temp_args:

 

/opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in deco(*args, **kwargs)

   29         raise FatalError('%s\n\nJava stack trace:\n%s\n'

   30                 'Hail version: %s\n'

---> 31                 'Error summary: %s' % (deepest, full, hail.__version__, deepest), error_id) from None

   32     except pyspark.sql.utils.CapturedException as e:

   33       raise FatalError('%s\n\nJava stack trace:\n%s\n'

 

FatalError: IllegalArgumentException: requirement failed

 

Java stack trace:

java.lang.IllegalArgumentException: requirement failed

at scala.Predef$.require(Predef.scala:212)

at is.hail.rvd.RVDPartitioner.<init>(RVDPartitioner.scala:52)

at is.hail.rvd.RVDPartitioner.extendKeySamePartitions(RVDPartitioner.scala:138)

at is.hail.expr.ir.LoweredTableReader$$anon$4.coerce(TableIR.scala:387)

at is.hail.expr.ir.GenericTableValue.toTableStage(GenericTableValue.scala:159)

at is.hail.io.vcf.MatrixVCFReader.lower(LoadVCF.scala:1790)

at is.hail.expr.ir.lowering.LowerTableIR$.is$hail$expr$ir$lowering$LowerTableIR$$lower$1(LowerTableIR.scala:403)

at is.hail.expr.ir.lowering.LowerTableIR$.apply(LowerTableIR.scala:1330)

at is.hail.expr.ir.lowering.LowerToCDA$.lower(LowerToCDA.scala:69)

at is.hail.expr.ir.lowering.LowerToCDA$.apply(LowerToCDA.scala:18)

at is.hail.expr.ir.lowering.LowerToDistributedArrayPass.transform(LoweringPass.scala:77)

at is.hail.expr.ir.LowerOrInterpretNonCompilable$.evaluate$1(LowerOrInterpretNonCompilable.scala:27)

at is.hail.expr.ir.LowerOrInterpretNonCompilable$.is$hail$expr$ir$LowerOrInterpretNonCompilable$$rewrite$1(LowerOrInterpretNonCompilable.scala:67)

at is.hail.expr.ir.LowerOrInterpretNonCompilable$.is$hail$expr$ir$LowerOrInterpretNonCompilable$$rewrite$1(LowerOrInterpretNonCompilable.scala:53)

at is.hail.expr.ir.LowerOrInterpretNonCompilable$.apply(LowerOrInterpretNonCompilable.scala:72)

at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.transform(LoweringPass.scala:69)

at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3$$anonfun$1.apply(LoweringPass.scala:16)

at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3$$anonfun$1.apply(LoweringPass.scala:16)

at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)

at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3.apply(LoweringPass.scala:16)

at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3.apply(LoweringPass.scala:14)

at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)

at is.hail.expr.ir.lowering.LoweringPass$class.apply(LoweringPass.scala:14)

at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.apply(LoweringPass.scala:64)

at is.hail.expr.ir.lowering.LoweringPipeline$$anonfun$apply$1.apply(LoweringPipeline.scala:15)

at is.hail.expr.ir.lowering.LoweringPipeline$$anonfun$apply$1.apply(LoweringPipeline.scala:13)

at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)

at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)

at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:13)

at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:47)

at is.hail.backend.spark.SparkBackend.is$hail$backend$spark$SparkBackend$$_execute(SparkBackend.scala:381)

at is.hail.backend.spark.SparkBackend$$anonfun$8$$anonfun$apply$4.apply(SparkBackend.scala:417)

at is.hail.backend.spark.SparkBackend$$anonfun$8$$anonfun$apply$4.apply(SparkBackend.scala:414)

at is.hail.backend.ExecuteContext$$anonfun$scoped$1$$anonfun$apply$1.apply(ExecuteContext.scala:47)

at is.hail.backend.ExecuteContext$$anonfun$scoped$1$$anonfun$apply$1.apply(ExecuteContext.scala:47)

at is.hail.utils.package$.using(package.scala:638)

at is.hail.backend.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:47)

at is.hail.backend.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:46)

at is.hail.utils.package$.using(package.scala:638)

at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)

at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:46)

at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:275)

at is.hail.backend.spark.SparkBackend$$anonfun$8.apply(SparkBackend.scala:414)

at is.hail.backend.spark.SparkBackend$$anonfun$8.apply(SparkBackend.scala:413)

at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)

at is.hail.backend.spark.SparkBackend.executeEncode(SparkBackend.scala:413)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

 

 

 

Hail version: 0.2.78-b17627756568

Error summary: IllegalArgumentException: requirement failed

 

Comments

3 comments

  • Comment author
    Ondrej Klempir DNAnexus Team

    I am observing the same behavior on my end for the WGS pVCF. I have informed the RAP engineering team.

    0
  • Thank you very much!

    I hope this can be resolved soon. It seems like no commands that actually operate on the data using Hail are working.

    0
  • Comment author
    Former User of DNAx Community_47

    Hello - I'm just wondering if there has been any progress on this? It seems to be a major issue if the WGS data cannot be processed using Hail.

    0

Please sign in to leave a comment.