Source code for grasp.db

"""
Functions for managing the GRASP database.

`get_session()` is used everywhere in the module to create a connection to the
database. `initialize_database()` is used to build the database from the GRASP
file. It takes about an hour 90 minutes to run and will overwrite any existing
database.
"""
import os as _os
from re   import compile  as _recompile
from bz2  import open     as _bopen
from gzip import open     as _zopen
from html import unescape as _unescape

# Date parsing
from datetime import date as _date
from dateutil.parser import parse as _dateparse

# Database
from sqlalchemy     import create_engine as _create_engine
from sqlalchemy.sql import select        as _select
from sqlalchemy.orm import sessionmaker  as _sessionmaker

# Progress bar
from tqdm import tqdm as _tqdm

# Table Declarations
from .tables import Base as _Base
from .tables import Study as _Study
from .tables import SNP as _SNP
from .tables import Phenotype as _Phenotype
from .tables import PhenoCats as _PhenoCats
from .tables import Platform as _Platform
from .tables import Population as _Population
from .tables import snp_pheno_assoc as _snp_pheno_assoc
from .tables import study_pheno_assoc as _study_pheno_assoc
from .tables import study_plat_assoc as _study_plat_assoc

# Database config
from .config import config as _config

# Reference objects
from .ref import PopFlag as _PopFlag
from .ref import pheno_synonyms
from .ref import pop_correction

__all__ = ["get_session", "initialize_database"]

###############################################################################
#                               Core Functions                                #
###############################################################################


[docs]def get_session(echo=False):
    """Return a session and engine, uses config file.

    Args:
        echo: Echo all SQL to the console.

    Returns:
        session, engine: A SQLAlchemy session and engine object corresponding
                         to the grasp database for use in querying.
    """
    db_type = _config['DEFAULT']['DatabaseType']
    if db_type == 'sqlite':
        engine_string = ('sqlite:///{db_file}'
                         .format(db_file=_config['sqlite']['DatabaseFile']))
    else:
        engine_string = ('{type}://{user}:{passwd}@{host}/grasp'
                         .format(type=db_type,
                                 user=_config['other']['DatabaseUser'],
                                 passwd=_config['other']['DatabasePass'],
                                 host=_config['other']['DatabaseHost']))
    engine  = _create_engine(engine_string, echo=echo)
    Session = _sessionmaker(bind=engine)
    return Session(), engine


[docs]def initialize_database(study_file, grasp_file, commit_every=250000,
                        progress=False):
    """Create the database quickly.

    :study_file:   Tab delimited GRASP study file, available here:
                   `<github.com/MikeDacre/grasp/blob/master/grasp_studies.txt>`_
    :grasp_file:   Tab delimited GRASP file.
    :commit_every: How many rows to go through before commiting to disk.
    :progress:     Display a progress bar (db length hard coded).

    """
    rows        = 0
    count       = commit_every
    pphenos     = {}
    phenos      = {}
    platforms   = {}
    populations = {}


    # Create tables
    _, engine = get_session()
    print('Dropping and creating database tables, this may take a while if',
          'the old database is large.')
    if _config['DEFAULT']['DatabaseType'] == 'sqlite':
        cfile = _os.path.isfile(_config['sqlite']['DatabaseFile'])
        if _os.path.isfile(cfile):
            _os.remove(cfile)
    _Base.metadata.drop_all(engine)
    _Base.metadata.create_all(engine)
    print('Tables created.')
    conn = engine.connect()

    # Get tables
    study_table = _Study.__table__
    snp_table   = _SNP.__table__
    pheno_table = _Phenotype.__table__
    pcat_table  = _PhenoCats.__table__
    plat_table  = _Platform.__table__
    pop_table   = _Population.__table__

    # Create insert statements
    study_ins   = study_table.insert()
    snp_ins     = snp_table.insert()
    pheno_ins   = pheno_table.insert()
    pcat_ins    = pcat_table.insert()
    plat_ins    = plat_table.insert()
    pop_ins     = pop_table.insert()
    phsnp_ins   = _snp_pheno_assoc.insert()
    phstudy_ins = _study_pheno_assoc.insert()
    plstudy_ins = _study_plat_assoc.insert()

    # Unique ID counters
    spare_id = 1
    pheno_id = 1
    pcat_id  = 1
    plat_id  = 1
    pop_id   = 1

    # Lists to hold records
    pheno_records   = []
    pcat_records    = []
    plat_records    = []
    pop_records     = []
    study_records   = []
    snp_records     = []
    phsnp_records   = []
    phstudy_records = []
    plstudy_records = []

    # Platform parsing regex
    plat_parser  = _recompile(r'^([^[]*)\[([^]]+)\]?(.*)')

    # Build study information from study file
    print('Parsing study information.')
    with _open_zipped(study_file) as fin:
        # Drop header
        fin.readline()

        if progress:
            pbar = _tqdm(total=2083, unit='studies')
        for line in fin:
            f = line.rstrip().split('\t')

            # Get primary phenotype
            ppheno = _cleanstr(f[7].strip())
            if ppheno not in pphenos:
                pheno_records.append({'phenotype': ppheno,
                                      'id': pheno_id})
                pphenos[ppheno] = pheno_id
                pheno_id += 1

            # Get phenotype categories
            pheno_cats = f[8].strip().split(';')
            our_phenos = []
            for pcat in pheno_cats:
                pcat = pcat.strip()
                if not pcat:
                    continue
                if pcat not in phenos:
                    pcat_records.append({
                        'id':       pcat_id,
                        'category': pcat,
                        'alias':    pheno_synonyms[pcat],
                    })
                    phenos[pcat] = pcat_id
                    pcat_id += 1
                our_phenos.append(phenos[pcat])

            # Get platform info
            our_platforms = []
            try:
                plat, snp_count, impt = [
                    i.strip() for i in plat_parser.findall(f[18].strip())[0]
                ]
                imputed = True if impt == '(imputed)' else False
                plats = _split_mesy_list(plat)
                for plat in plats:
                    plat = plat.strip()
                    if plat not in platforms:
                        plat_records.append({'id':       plat_id,
                                             'platform': plat})
                        platforms[plat] = plat_id
                        plat_id += 1
                    our_platforms.append(platforms[plat])
            except IndexError:
                plat, snp_count, impt = None, None, None
                imputed = None

            # Get population description
            try:
                pop = f[19].strip()
                try:
                    pop = pop_correction[pop]
                except KeyError:
                    pass
                if pop not in populations:
                    pop_records.append({'id':         pop_id,
                                        'population': pop})
                    populations[pop] = pop_id
                    pop_id += 1
                population = populations[pop]
            except IndexError:
                population = None

            # Set populaion flags
            pflag = _PopFlag
            disc_pop = pflag(0)
            rep_pop  = pflag(0)
            l = len(f)
            if l > 22 and f[22]:
                disc_pop |= pflag.eur
            if l > 23 and f[23]:
                disc_pop |= pflag.afr
            if l > 24 and f[24]:
                disc_pop |= pflag.east_asian
            if l > 25 and f[25]:
                disc_pop |= pflag.south_asian
            if l > 26 and f[26]:
                disc_pop |= pflag.his
            if l > 27 and f[27]:
                disc_pop |= pflag.native
            if l > 28 and f[28]:
                disc_pop |= pflag.micro
            if l > 29 and f[29]:
                disc_pop |= pflag.arab
            if l > 30 and f[30]:
                disc_pop |= pflag.mix
            if l > 31 and f[31]:
                disc_pop |= pflag.uns
            if l > 32 and f[32]:
                disc_pop |= pflag.filipino
            if l > 33 and f[33]:
                disc_pop |= pflag.indonesian
            if l > 35 and f[35]:
                rep_pop |= pflag.eur
            if l > 36 and f[36]:
                rep_pop |= pflag.afr
            if l > 37 and f[37]:
                rep_pop |= pflag.east_asian
            if l > 38 and f[38]:
                rep_pop |= pflag.south_asian
            if l > 39 and f[39]:
                rep_pop |= pflag.his
            if l > 40 and f[40]:
                rep_pop |= pflag.native
            if l > 41 and f[41]:
                rep_pop |= pflag.micro
            if l > 42 and f[42]:
                rep_pop |= pflag.arab
            if l > 43 and f[43]:
                rep_pop |= pflag.mix
            if l > 44 and f[44]:
                rep_pop |= pflag.uns
            if l > 45 and f[45]:
                rep_pop |= pflag.filipino
            if l > 46 and f[46]:
                rep_pop |= pflag.indonesian

            # Set the global population flag
            pop_flag = disc_pop | rep_pop

            # Create study
            study_records.append({
                'id':               int(f[0]),
                'author':           _cleanstr(f[1]),
                'pmid':             _cleanstr(f[2]),
                'grasp_ver':        1 if '1.0' in f[3] else 2,
                'noresults':        True if f[4] else False,
                'results':          int(f[5]),
                'qtl':              True if f[6] == '1' else False,
                'phenotype_id':     pphenos[ppheno],
                'phenotype_desc':   ppheno,
                'phenotype':        pphenos[ppheno],
                'phenotype_cats':   our_phenos,
                'datepub':          _get_date(f[9]),
                'in_nhgri':         _get_bool(f[10]),
                'journal':          _cleanstr(f[11]),
                'title':            _cleanstr(f[12]),
                'locations':        _cleanstr(f[13]),
                'mf':               _get_bool(f[14]),
                'mf_only':          _get_bool(f[15]),
                'sample_size':      _cleanstr(f[16]),
                'replication_size': _cleanstr(f[17]),
                'platforms':        platforms,
                'snp_count':        snp_count,
                'imputed':          imputed,
                'population_id':    population,
                'population':       population,
                'total':            int(f[20]),
                'total_disc':       int(f[21]),
                'pop_flag':         int(pop_flag),
                'disc_pop_flag':    int(disc_pop),
                'european':         int(f[22]) if l > 22 and f[22] else None,
                'african':          int(f[23]) if l > 23 and f[23] else None,
                'east_asian':       int(f[24]) if l > 24 and f[24] else None,
                'south_asian':      int(f[25]) if l > 25 and f[25] else None,
                'hispanic':         int(f[26]) if l > 26 and f[26] else None,
                'native':           int(f[27]) if l > 27 and f[27] else None,
                'micronesian':      int(f[28]) if l > 28 and f[28] else None,
                'arab':             int(f[29]) if l > 29 and f[29] else None,
                'mixed':            int(f[30]) if l > 30 and f[30] else None,
                'unspecified':      int(f[31]) if l > 31 and f[31] else None,
                'filipino':         int(f[32]) if l > 32 and f[32] else None,
                'indonesian':       int(f[33]) if l > 33 and f[33] else None,
                'total_rep':        int(f[34]) if l > 34 and f[34] else None,
                'rep_pop_flag':     int(rep_pop),
                'rep_european':     int(f[35]) if l > 35 and f[35] else None,
                'rep_african':      int(f[36]) if l > 36 and f[36] else None,
                'rep_east_asian':   int(f[37]) if l > 37 and f[37] else None,
                'rep_south_asian':  int(f[38]) if l > 38 and f[38] else None,
                'rep_hispanic':     int(f[39]) if l > 39 and f[39] else None,
                'rep_native':       int(f[40]) if l > 40 and f[40] else None,
                'rep_micronesian':  int(f[41]) if l > 41 and f[41] else None,
                'rep_arab':         int(f[42]) if l > 42 and f[42] else None,
                'rep_mixed':        int(f[43]) if l > 43 and f[43] else None,
                'rep_unspecified':  int(f[44]) if l > 44 and f[44] else None,
                'rep_filipino':     int(f[45]) if l > 45 and f[45] else None,
                'rep_indonesian':   int(f[46]) if l > 46 and f[46] else None,
            })

            # Create association records
            for i in our_phenos:
                phstudy_records.append({'study_id':    int(f[0]),
                                        'pheno_id':    i})
            for i in our_platforms:
                plstudy_records.append({'study_id':    int(f[0]),
                                        'platform_id': i})

            pbar.update()

    pbar.close()
    print('Writing study information...')
    conn.execute(pheno_ins, pheno_records)
    conn.execute(pcat_ins, pcat_records)
    conn.execute(plat_ins, plat_records)
    conn.execute(pop_ins, pop_records)
    conn.execute(study_ins, study_records)
    conn.execute(phstudy_ins, phstudy_records)
    conn.execute(plstudy_ins, plstudy_records)
    print('Done')

    # Reinitialize lists for main GRASP parser
    pheno_records   = []
    pcat_records    = []
    plat_records    = []
    pop_records     = []

    # Get full study info from database for use in SNPs
    sinfo = conn.execute(_select([study_table.c.id, study_table.c.pmid])).fetchall()
    studies = {}
    for i, p in sinfo:
        studies[p] = i
    no_pmid = {
        'Dissertation (https://openaccess.leidenuniv.nl/handle/1887/17746)':                                                                          1,
        'KARE Genomewide Association Study of Blood Pressure Using Imputed SNPs':                                                                     2,
        'Genome-wide Association Study Identification of a New Genetic Locus with Susceptibility to Osteoporotic Fracture in the Korean Population.': 3,
        'Genome-wide Association Study Identified TIMP2 Genetic Variant with Susceptibility to Osteoarthritis':                                       4,
        'Application of Structural Equation Models to Genome-wide Association Analysis ':                                                             5,
        'Comparison of Erythrocyte Traits Among European, Japanese and Korean':                                                                       6,
        'Genomewide Association Study Identification of a New Genetic Locus with Susceptibility to Osteoporotic Fracture in the Korean Population':   7,
        'Joint identification of multiple genetic variants of obesity in A Korean Genome-wide association study':                                     8,
        'Genome-Wide Association Analyses on Blood Pressure Using Three Different Phenotype Definitions':                                             9,
        'Association of intronic sequence variant in the gene encoding spleen tyrosine kinase with susceptibility to vascular dementia':              10,
    }

    print('Parsing SNP information...')
    with _open_zipped(grasp_file, encoding='latin1') as fin:
        # Drop header
        fin.readline()

        if progress:
            pbar = _tqdm(total=8864717, unit='snps')

        for line in fin:
            f = line.rstrip().split('\t')

            # Get primary phenotype
            ppheno = _cleanstr(f[11])
            # These are poorly curated, so there is no need to use a
            # separate table for them.
            #  if ppheno not in pphenos:
                #  conn.execute(pheno_ins.values(
                    #  phenotype=ppheno
                #  ))
                #  pphenos[ppheno] = conn.execute(
                    #  select([pheno_table.c.id]).where(
                        #  pheno_table.c.phenotype == ppheno
                    #  )
                #  ).first()[0]

            # Get phenotype categories
            pheno_cats = f[13].strip().split(';')
            our_phenos = []
            for pcat in pheno_cats:
                pcat = pcat.strip()
                if not pcat:
                    continue
                if pcat not in phenos:
                    pcat_records.append({
                        'id':       pcat_id,
                        'category': pcat,
                        'alias':    pheno_synonyms[pcat],
                    })
                    phenos[pcat] = pcat_id
                    pcat_id += 1
                our_phenos.append(phenos[pcat])

            # Get population description
            try:
                pop = f[23].strip()
                try:
                    pop = pop_correction[pop]
                except KeyError:
                    pass
                if pop not in populations:
                    pop_records.append({'id':         pop_id,
                                        'population': pop})
                    populations[pop] = pop_id
                    pop_id += 1
                population = populations[pop]
            except IndexError:
                population = None

            # Create record for SNP
            try:
                sid       = int(f[0])
            except ValueError:
                sid       = spare_id
                spare_id += 1
            l = len(f)
            try:
                study = studies[f[7].strip()]
            except KeyError:
                study = no_pmid[f[17].strip()]
            record = {
                'id':               sid,
                'NHLBIkey':         f[0],
                'HUPfield':         f[1],
                'LastCurationDate': _get_date(f[2]),
                'CreationDate':     _get_date(f[3]),
                'snpid':            f[4],
                'chrom':            f[5],
                'pos':              int(f[6]),
                'population_id':    population,
                'population':       population,
                'study_id':         study,
                'study':            study,
                'study_snpid':      f[8],
                'paper_loc':        f[9],
                'pval':             float(f[10]) if f[10] else None,
                'phenotype_desc':   ppheno,
                'phenotype_cats':   our_phenos,
            }
            record['InGene']             = f[51] if l > 52 else None
            record['NearestGene']        = f[52] if l > 53 else None
            record['InLincRNA']          = f[53] if l > 54 else None
            record['InMiRNA']            = f[54] if l > 55 else None
            record['InMiRNABS']          = f[55] if l > 56 else None
            record['dbSNPfxn']           = f[56] if l > 57 else None
            record['dbSNPMAF']           = f[57] if l > 58 else None
            record['dbSNPinfo']          = f[58] if l > 59 else None
            record['dbSNPvalidation']    = f[59] if l > 60 else None
            record['dbSNPClinStatus']    = f[60] if l > 61 else None
            record['ORegAnno']           = f[61] if l > 62 else None
            record['ConservPredTFBS']    = f[62] if l > 63 else None
            record['HumanEnhancer']      = f[63] if l > 64 else None
            record['RNAedit']            = f[64] if l > 65 else None
            record['PolyPhen2']          = f[65] if l > 66 else None
            record['SIFT']               = f[66] if l > 67 else None
            record['LSSNP']              = f[67] if l > 68 else None
            record['UniProt']            = f[68] if l > 69 else None
            record['EqtlMethMetabStudy'] = f[69] if l > 70 else None
            snp_records.append(record)

            # Create association records
            for i in our_phenos:
                phsnp_records.append({'snp_id' : sid, 'pheno_id' : i})

            # Decide when to execute
            if count:
                count -= 1
            else:
                if progress:
                    pbar.write('Writing rows...')
                else:
                    print('Writing rows...')
                if pcat_records:
                    conn.execute(pcat_ins, pcat_records)
                if plat_records:
                    conn.execute(plat_ins, plat_records)
                if pop_records:
                    conn.execute(pop_ins, pop_records)
                conn.execute(snp_ins, snp_records)
                conn.execute(phsnp_ins, phsnp_records)
                if progress:
                    pbar.write('{} rows written'.format(rows))
                else:
                    print('{} rows written'.format(rows))
                count         = commit_every-1
                pcat_records  = []
                plat_records  = []
                pop_records   = []
                snp_records   = []
                phsnp_records = []
            rows += 1
            if progress:
                pbar.update()

        # Final insert
        pbar.close()
        print('Writing final rows...')
        conn.execute(snp_ins, snp_records)
        conn.execute(phsnp_ins, phsnp_records)
        print('{} rows written'.format(rows))
        print('Done!')


###############################################################################
#                              Support Functions                              #
###############################################################################


_rmspace  = _recompile(r'  +')
def _cleanstr(string):
    """Run a few regex cleanups on a string.

    Strips unneeded starting characters, trims extra spaces, and converts
    mangeled unicode characters from Excel into readable characters.
    """
    cstr = _rmspace.sub(' ', string.strip(" \"'"))
    return _unescape(cstr)


def _split_mesy_list(string):
    """Split a string that contains commas and 'ands/&'

    :string:  A string with commas/ands/&
    :returns: A list

    """
    init_list  = [i.strip() for i in string.split(',') if i]
    final_list = []
    for i in init_list:
        if i.isspace():
            continue
        andlist = i.split('and')
        amplist = i.split('&')
        if len(andlist) > 1:
            for j in andlist:
                if not j or j.isspace():
                    continue
                final_list.append(j.strip())
        elif len(amplist) > 1:
            for j in amplist:
                if not j or j.isspace():
                    continue
                final_list.append(j.strip())
        else:
            final_list.append(i.strip())
    final_list = [i.strip() for i in final_list if not i.isspace()]
    return [i for i in final_list if i]


def _get_date(string):
    """Return datetime date object from string."""
    try:
        return _date.fromordinal(_dateparse(string).toordinal())
    except ValueError:
        print(string)
        raise


def _get_bool(string):
    """Return a bool from a string of y/n."""
    string = string.lower()
    return True if string == 'y' else False


def _open_zipped(infile, mode='r', encoding='utf-8'):
    """ Return file handle of file regardless of zipped or not
        Text mode enforced for compatibility with python2 """
    mode   = mode[0] + 't'
    p2mode = mode
    if hasattr(infile, 'write'):
        return infile
    if isinstance(infile, str):
        if infile.endswith('.gz'):
            return _zopen(infile, mode)
        if infile.endswith('.bz2'):
            return _bopen(infile, mode)
        return open(infile, p2mode, encoding=encoding)