Issues Accessing .bgen Files and Role of .bgi Files in LD Clumping
I’m encountering issues with accessing .bgen files in JupyterLab while performing LD clumping. Specifically:
- Using
bgenix, I get errors and empty outputs when processing.bgenfiles, despite confirming their presence. - I understand
.bgifiles are used bybgenix, but are they strictly necessary for LD clumping in PLINK?
Could the issue be related to JupyterLab configurations (e.g., file size limits or paths)?
Any guidance on addressing these issues would be much appreciated!
Comments
2 comments
Hello Alexander,
Can you provide some more information on the commands used and error messages you are having?
Thank you for your reply.
i keep getting this :
Processing chromosome 1
Processing 8 variants
Batch 1: 8 variants
Extracting variants from BGEN file
Error processing chromosome 1 batch 0: Command '['./bgen.tgz/build/apps/bgenix', '-g', 'tmp/ukb21008_c1_b0_v1.bgen', '-incl-rsids', 'rs_batch.txt']' died with <Signals.SIGABRT: 6>.
Error output:
for all chromosomes.
I moved the imputed bgen and sample files to REGENIE_output directory on the DNAnexus platform so files are there.
here is the rest of relevant code:
%%bash
# Create symlink for imputed data with correct path
DIR='/REGENIE_output/Imputation from genotype (GEL)'
ln -sf "$DIR" /opt/notebooks/imputed
# Same directory for samples since they're in the same location
ln -sf "$DIR" /opt/notebooks/samples
# QC SNP list location
ln -sf '/REGENIE_output' /opt/notebooks/keepsnps
# Verify symlinks
ls -l /opt/notebooks/imputed
ls -l /opt/notebooks/samples
ls -l /opt/notebooks/keepsnps
# Create temporary directory
if os.path.exists('tmp'):
shutil.rmtree('tmp')
os.mkdir('tmp')
imputation_folder = 'Imputation from genotype (GEL)'
output_dir = '/Data/'
# Process chromosomes
chromosomes = variants_to_analyze['Chr'].unique()
BATCH_SIZE = 10000
# Using symlinks for imputed data
for chromosome in sorted(chromosomes, key=lambda x: int(x)):
print(f"\nProcessing chromosome {chromosome}")
# Define file names
bgen_file = f"ukb21008_c{chromosome}_b0_v1.bgen"
sample_file = f"ukb21008_c{chromosome}_b0_v1.sample"
# Use symlink
bgen_symlink = f"tmp/{bgen_file}"
sample_symlink = f"tmp/{sample_file}"
try:
if not os.path.exists(bgen_symlink):
os.symlink(os.path.join(imputation_folder, bgen_file), bgen_symlink)
if not os.path.exists(sample_symlink):
os.symlink(os.path.join(imputation_folder, sample_file), sample_symlink)
except FileExistsError:
print(f"Symlink for chromosome {chromosome} already exists, skipping creation.")
rs_names = variants_to_analyze[variants_to_analyze['Chr'] == chromosome]['Name']
print(f"Processing {len(rs_names)} variants")
# Process variants
for i, start_idx in enumerate(range(0, len(rs_names), BATCH_SIZE)):
rs_names_batch = rs_names[start_idx:start_idx + BATCH_SIZE]
print(f" Batch {i+1}: {len(rs_names_batch)} variants")
rs_names_batch.to_csv('rs_batch.txt', header=False, index=False)
new_bgen_name = f'tmp/{chromosome}_{i}.bgen'
plink_output_prefix = f'tmp/plink_{chromosome}_{i}'
plink_ld_output_prefix = f'tmp/plink_{chromosome}_{i}_ld_clumped'
try:
print('Extracting variants from BGEN file')
with open(new_bgen_name, 'wb') as new_bgen:
subprocess.check_call([
'./bgen.tgz/build/apps/bgenix',
'-g', bgen_symlink,
'-incl-rsids', 'rs_batch.txt'
], stdout=new_bgen, stderr=subprocess.PIPE)
time.sleep(1) # Adding delay to prevent system overload
print('Converting to PLINK format')
subprocess.check_call([
'./plink2',
'--bgen', new_bgen_name, 'ref-first',
'--sample', sample_symlink,
'--rm-dup', 'force-first',
'--out', plink_output_prefix,
'--keep-fam', 'eids_to_keep.txt',
'--make-bed'
])
time.sleep(1) # Adding delay to prevent system overload
print('Performing LD clumping')
subprocess.check_call([
'./plink',
'--bfile', plink_output_prefix,
'--extract', 'train_gel_imputed_snps_data_qc_pass.snplist',
'--keep-fam', 'eids_to_keep.txt',
'--clump-p1', str(suggestive_threshold),
'--clump-r2', '0.1',
'--clump-kb', '250',
'--clump', 'variants_to_analyze.txt',
'--clump-snp-field', 'Name',
'--clump-field', 'Pval',
'--out', f'{plink_ld_output_prefix}_ld_clumped'
])
time.sleep(1) # Adding delay to prevent system overload
except subprocess.CalledProcessError as e:
print(f"Error processing chromosome {chromosome} batch {i}: {e}")
if hasattr(e, 'stderr'):
print(f"Error output: {e.stderr.decode() if e.stderr else ''}")
continue
except Exception as e:
print(f"Unexpected error processing chromosome {chromosome} batch {i}: {e}")
continue
# Clean up batch files
if os.path.exists(new_bgen_name):
os.remove(new_bgen_name)
if os.path.exists('rs_batch.txt'):
os.remove('rs_batch.txt')
# Clean up symlinks after processing
if os.path.exists(bgen_symlink):
os.remove(bgen_symlink)
if os.path.exists(sample_symlink):
os.remove(sample_symlink)
print("\nProcessing completed.")
Please sign in to leave a comment.