Module pygwasvcf.gwas_vcf
Expand source code
from pysam import VariantFile
import pygwasvcf
import sqlite3
import os
"""
Class to parse GWAS-VCF file using pysam
"""
class GwasVcf:
def __init__(self, file_path, rsidx_path=None):
"""
Constructor for GwasVcf class
:param file_path: Path to GWAS-VCF
:param rsidx_path: Path to RSIDX (optional)
"""
self.__file_path = file_path
self.__rsidx_path = rsidx_path
self.__vcf = None
def __enter__(self):
"""
Open VariantFile using with resources
"""
self.__vcf = VariantFile(self.__file_path)
return self
def __exit__(self, type, value, traceback):
"""
Close GWAS-VCF file handle
"""
self.__vcf.close()
self.__vcf = None
def is_closed(self):
return self.__vcf is None
@staticmethod
def format_variant_record_for_rsidx(rec):
"""
Function to extract data for RSIDX query
:param rec: pysam.VariantRecord
:return: rsid: dbSNP identifier as integer (rs removed)
:return: chrom: chromosome for association
:return: pos: position for association (1-based)
"""
for assoc in rec.samples:
var_id = rec.samples[assoc]['ID']
if var_id is not None:
for rsid in var_id.split(";"):
if rsid[0:2] == "rs":
yield int(rsid[2:]), rec.chrom, rec.pos
def index_rsid(self):
"""
Index GWAS-VCF using rsID and pysam adapted from [rsidx](https://github.com/bioforensics/rsidx)
"""
if self.__vcf is None:
raise ValueError("Cannot use methods on the VCF object before the file is open.")
idx_path = self.__file_path + ".rsidx"
if os.path.exists(idx_path):
os.remove(idx_path)
with sqlite3.connect(idx_path) as dbconn:
# prepare database
c = dbconn.cursor()
c.execute(
'CREATE TABLE rsid_to_coord ('
'rsid INTEGER PRIMARY KEY, '
'chrom TEXT NULL DEFAULT NULL, '
'coord INTEGER NOT NULL DEFAULT 0)'
)
dbconn.commit()
# add variant records to SQLite DB
for rec in self.__vcf.fetch():
c.executemany('INSERT OR IGNORE INTO rsid_to_coord VALUES (?,?,?)',
GwasVcf.format_variant_record_for_rsidx(rec))
dbconn.commit()
self.__rsidx_path = idx_path
def get_metadata(self):
"""
Extract metadata about the GWAS trait(s)
:return: res: Dict of Dict containing a key=value pairs for each trait in the GWAS-VCF
"""
if self.__vcf is None:
raise ValueError("Cannot use methods on the VCF object before the file is open.")
res = dict()
for rec in self.__vcf.header.records:
if rec.key == "SAMPLE":
res[rec['ID']] = dict()
for k in rec:
if k != "ID":
res[rec['ID']][k] = rec[k]
return res
def get_traits(self):
"""
Extract list of traits in the GWAS-VCF
:return: traits: List of traits
"""
return list(self.__vcf.header.samples)
def get_location_from_rsid(self, rsid):
"""
Helper function to convert rsID to chromosome and position using [rsidx](https://github.com/bioforensics/rsidx)
:param rsid: dbsnp indentifier
:return: res: chromosome
:return: res: position (1-based)
"""
if not rsid.startswith("rs"):
raise ValueError("Variant ID query must be an rsID")
if self.__rsidx_path is None:
raise ValueError("Cannot query by variant identifier without providing an rsidx")
q = (int(rsid[2:]),)
with sqlite3.connect(self.__rsidx_path) as dbconn:
cur = dbconn.cursor()
cur.execute('SELECT DISTINCT chrom,coord FROM rsid_to_coord WHERE rsid =?', q)
res = cur.fetchone()
return res[0], res[1]
def query(self, contig=None, start=None, stop=None, variant_id=None, exclude_filtered=True):
"""
Variant-trait association query function
:param contig: Chromosome to query
:param start: Start position of interval (0-based)
:param stop: End position of interval (0-based)
:param variant_id: rsID to query using [rsidx](https://github.com/bioforensics/rsidx)
:param exclude_filtered: Boolean flag to remove record that do not meet QC
:return: rec: pysam.VariantRecord object containing chromosome, position, alleles, association statistics
"""
if self.__vcf is None:
raise ValueError("Cannot use methods on the VCF object before the file is open.")
if variant_id is not None:
if contig is not None or start is not None or stop is not None:
raise ValueError("Cannot provide chromosome, start or end with variant ID. Choose one query.")
contig, pos = self.get_location_from_rsid(variant_id)
start = pos - 1
stop = pos
# extract variant(s) from GWAS-VCF
for rec in self.__vcf.fetch(contig=contig, start=start, stop=stop):
# check multiallelics are on separate rows which is required for functions
pygwasvcf.VariantRecordGwasFuns.check_biallelic(rec)
# skip variants not meeting filter requirements
if exclude_filtered and rec.filter.keys()[0] != "PASS":
continue
# lazy return record
yield rec
Classes
class GwasVcf (file_path, rsidx_path=None)
-
Constructor for GwasVcf class :param file_path: Path to GWAS-VCF :param rsidx_path: Path to RSIDX (optional)
Expand source code
class GwasVcf: def __init__(self, file_path, rsidx_path=None): """ Constructor for GwasVcf class :param file_path: Path to GWAS-VCF :param rsidx_path: Path to RSIDX (optional) """ self.__file_path = file_path self.__rsidx_path = rsidx_path self.__vcf = None def __enter__(self): """ Open VariantFile using with resources """ self.__vcf = VariantFile(self.__file_path) return self def __exit__(self, type, value, traceback): """ Close GWAS-VCF file handle """ self.__vcf.close() self.__vcf = None def is_closed(self): return self.__vcf is None @staticmethod def format_variant_record_for_rsidx(rec): """ Function to extract data for RSIDX query :param rec: pysam.VariantRecord :return: rsid: dbSNP identifier as integer (rs removed) :return: chrom: chromosome for association :return: pos: position for association (1-based) """ for assoc in rec.samples: var_id = rec.samples[assoc]['ID'] if var_id is not None: for rsid in var_id.split(";"): if rsid[0:2] == "rs": yield int(rsid[2:]), rec.chrom, rec.pos def index_rsid(self): """ Index GWAS-VCF using rsID and pysam adapted from [rsidx](https://github.com/bioforensics/rsidx) """ if self.__vcf is None: raise ValueError("Cannot use methods on the VCF object before the file is open.") idx_path = self.__file_path + ".rsidx" if os.path.exists(idx_path): os.remove(idx_path) with sqlite3.connect(idx_path) as dbconn: # prepare database c = dbconn.cursor() c.execute( 'CREATE TABLE rsid_to_coord (' 'rsid INTEGER PRIMARY KEY, ' 'chrom TEXT NULL DEFAULT NULL, ' 'coord INTEGER NOT NULL DEFAULT 0)' ) dbconn.commit() # add variant records to SQLite DB for rec in self.__vcf.fetch(): c.executemany('INSERT OR IGNORE INTO rsid_to_coord VALUES (?,?,?)', GwasVcf.format_variant_record_for_rsidx(rec)) dbconn.commit() self.__rsidx_path = idx_path def get_metadata(self): """ Extract metadata about the GWAS trait(s) :return: res: Dict of Dict containing a key=value pairs for each trait in the GWAS-VCF """ if self.__vcf is None: raise ValueError("Cannot use methods on the VCF object before the file is open.") res = dict() for rec in self.__vcf.header.records: if rec.key == "SAMPLE": res[rec['ID']] = dict() for k in rec: if k != "ID": res[rec['ID']][k] = rec[k] return res def get_traits(self): """ Extract list of traits in the GWAS-VCF :return: traits: List of traits """ return list(self.__vcf.header.samples) def get_location_from_rsid(self, rsid): """ Helper function to convert rsID to chromosome and position using [rsidx](https://github.com/bioforensics/rsidx) :param rsid: dbsnp indentifier :return: res: chromosome :return: res: position (1-based) """ if not rsid.startswith("rs"): raise ValueError("Variant ID query must be an rsID") if self.__rsidx_path is None: raise ValueError("Cannot query by variant identifier without providing an rsidx") q = (int(rsid[2:]),) with sqlite3.connect(self.__rsidx_path) as dbconn: cur = dbconn.cursor() cur.execute('SELECT DISTINCT chrom,coord FROM rsid_to_coord WHERE rsid =?', q) res = cur.fetchone() return res[0], res[1] def query(self, contig=None, start=None, stop=None, variant_id=None, exclude_filtered=True): """ Variant-trait association query function :param contig: Chromosome to query :param start: Start position of interval (0-based) :param stop: End position of interval (0-based) :param variant_id: rsID to query using [rsidx](https://github.com/bioforensics/rsidx) :param exclude_filtered: Boolean flag to remove record that do not meet QC :return: rec: pysam.VariantRecord object containing chromosome, position, alleles, association statistics """ if self.__vcf is None: raise ValueError("Cannot use methods on the VCF object before the file is open.") if variant_id is not None: if contig is not None or start is not None or stop is not None: raise ValueError("Cannot provide chromosome, start or end with variant ID. Choose one query.") contig, pos = self.get_location_from_rsid(variant_id) start = pos - 1 stop = pos # extract variant(s) from GWAS-VCF for rec in self.__vcf.fetch(contig=contig, start=start, stop=stop): # check multiallelics are on separate rows which is required for functions pygwasvcf.VariantRecordGwasFuns.check_biallelic(rec) # skip variants not meeting filter requirements if exclude_filtered and rec.filter.keys()[0] != "PASS": continue # lazy return record yield rec
Static methods
def format_variant_record_for_rsidx(rec)
-
Function to extract data for RSIDX query :param rec: pysam.VariantRecord :return: rsid: dbSNP identifier as integer (rs removed) :return: chrom: chromosome for association :return: pos: position for association (1-based)
Expand source code
@staticmethod def format_variant_record_for_rsidx(rec): """ Function to extract data for RSIDX query :param rec: pysam.VariantRecord :return: rsid: dbSNP identifier as integer (rs removed) :return: chrom: chromosome for association :return: pos: position for association (1-based) """ for assoc in rec.samples: var_id = rec.samples[assoc]['ID'] if var_id is not None: for rsid in var_id.split(";"): if rsid[0:2] == "rs": yield int(rsid[2:]), rec.chrom, rec.pos
Methods
def get_location_from_rsid(self, rsid)
-
Helper function to convert rsID to chromosome and position using rsidx :param rsid: dbsnp indentifier :return: res: chromosome :return: res: position (1-based)
Expand source code
def get_location_from_rsid(self, rsid): """ Helper function to convert rsID to chromosome and position using [rsidx](https://github.com/bioforensics/rsidx) :param rsid: dbsnp indentifier :return: res: chromosome :return: res: position (1-based) """ if not rsid.startswith("rs"): raise ValueError("Variant ID query must be an rsID") if self.__rsidx_path is None: raise ValueError("Cannot query by variant identifier without providing an rsidx") q = (int(rsid[2:]),) with sqlite3.connect(self.__rsidx_path) as dbconn: cur = dbconn.cursor() cur.execute('SELECT DISTINCT chrom,coord FROM rsid_to_coord WHERE rsid =?', q) res = cur.fetchone() return res[0], res[1]
def get_metadata(self)
-
Extract metadata about the GWAS trait(s) :return: res: Dict of Dict containing a key=value pairs for each trait in the GWAS-VCF
Expand source code
def get_metadata(self): """ Extract metadata about the GWAS trait(s) :return: res: Dict of Dict containing a key=value pairs for each trait in the GWAS-VCF """ if self.__vcf is None: raise ValueError("Cannot use methods on the VCF object before the file is open.") res = dict() for rec in self.__vcf.header.records: if rec.key == "SAMPLE": res[rec['ID']] = dict() for k in rec: if k != "ID": res[rec['ID']][k] = rec[k] return res
def get_traits(self)
-
Extract list of traits in the GWAS-VCF :return: traits: List of traits
Expand source code
def get_traits(self): """ Extract list of traits in the GWAS-VCF :return: traits: List of traits """ return list(self.__vcf.header.samples)
def index_rsid(self)
-
Index GWAS-VCF using rsID and pysam adapted from rsidx
Expand source code
def index_rsid(self): """ Index GWAS-VCF using rsID and pysam adapted from [rsidx](https://github.com/bioforensics/rsidx) """ if self.__vcf is None: raise ValueError("Cannot use methods on the VCF object before the file is open.") idx_path = self.__file_path + ".rsidx" if os.path.exists(idx_path): os.remove(idx_path) with sqlite3.connect(idx_path) as dbconn: # prepare database c = dbconn.cursor() c.execute( 'CREATE TABLE rsid_to_coord (' 'rsid INTEGER PRIMARY KEY, ' 'chrom TEXT NULL DEFAULT NULL, ' 'coord INTEGER NOT NULL DEFAULT 0)' ) dbconn.commit() # add variant records to SQLite DB for rec in self.__vcf.fetch(): c.executemany('INSERT OR IGNORE INTO rsid_to_coord VALUES (?,?,?)', GwasVcf.format_variant_record_for_rsidx(rec)) dbconn.commit() self.__rsidx_path = idx_path
def is_closed(self)
-
Expand source code
def is_closed(self): return self.__vcf is None
def query(self, contig=None, start=None, stop=None, variant_id=None, exclude_filtered=True)
-
Variant-trait association query function :param contig: Chromosome to query :param start: Start position of interval (0-based) :param stop: End position of interval (0-based) :param variant_id: rsID to query using rsidx :param exclude_filtered: Boolean flag to remove record that do not meet QC :return: rec: pysam.VariantRecord object containing chromosome, position, alleles, association statistics
Expand source code
def query(self, contig=None, start=None, stop=None, variant_id=None, exclude_filtered=True): """ Variant-trait association query function :param contig: Chromosome to query :param start: Start position of interval (0-based) :param stop: End position of interval (0-based) :param variant_id: rsID to query using [rsidx](https://github.com/bioforensics/rsidx) :param exclude_filtered: Boolean flag to remove record that do not meet QC :return: rec: pysam.VariantRecord object containing chromosome, position, alleles, association statistics """ if self.__vcf is None: raise ValueError("Cannot use methods on the VCF object before the file is open.") if variant_id is not None: if contig is not None or start is not None or stop is not None: raise ValueError("Cannot provide chromosome, start or end with variant ID. Choose one query.") contig, pos = self.get_location_from_rsid(variant_id) start = pos - 1 stop = pos # extract variant(s) from GWAS-VCF for rec in self.__vcf.fetch(contig=contig, start=start, stop=stop): # check multiallelics are on separate rows which is required for functions pygwasvcf.VariantRecordGwasFuns.check_biallelic(rec) # skip variants not meeting filter requirements if exclude_filtered and rec.filter.keys()[0] != "PASS": continue # lazy return record yield rec