WGS Simulation of DUF1220 Regions

From BCOeditor Wiki
Revision as of 21:03, 5 February 2026 by Lorikrammer (talk | contribs)
Jump to navigation Jump to search
 
{
    "object_id": "https://biocomputeobject.org/BCO_000545/1.0",
    "spec_version": "https://w3id.org/ieee/ieee-2791-schema/2791object.json",
    "provenance_domain": {
      "name": "WGS Simulation of DUF1220 Regions",
      "version": "4.0",
      "license": "https://opensource.org/licenses/MIT",
      "created": "2020-08-30T11:30:52.000Z",
      "modified": "2024-11-17T14:18:09.680Z",
      "derived_from": "https://biocomputeobject.org/viewer?https://biocomputeobject.org/BCO_000277/4.0",
      "contributors": [
        {
          "name": "David Astling",
          "orcid": "https://orcid.org/0000-0001-8179-0304",
          "affiliation": "University of Colorado",
          "contribution": [
            "authoredBy"
          ],
          "email": "david.astling@example.com"
        },
        {
          "name": "Ilea Heft",
          "orcid": "https://orcid.org/0000-0002-7759-7007",
          "affiliation": "University of Colorado",
          "contribution": [
            "authoredBy"
          ],
          "email": "ilea.heft@example.com"
        },
        {
          "name": "Kenneth Jones",
          "affiliation": "University of Colorado",
          "contribution": [
            "authoredBy"
          ],
          "email": "kenneth.jones@example.com"
        },
        {
          "name": "James Sikela",
          "orcid": "https://orcid.org/0000-0001-5820-2762",
          "affiliation": "University of Colorado",
          "contribution": [
            "authoredBy"
          ],
          "email": "james.sikela@example.com"
        },
        {
          "name": "Jonathon Keeney",
          "affiliation": "George Washington University",
          "contribution": [
            "createdBy"
          ],
          "email": "keeneyjg@gwu.edu"
        },
        {
          "name": "Alex Nguyen",
          "affiliation": "UVA",
          "contribution": [
            "createdBy"
          ],
          "email": "tan5um@example.com"
        }
      ],
      "review": [
        {
          "reviewer": {
            "name": "Mike Taylor",
            "orcid": "https://orcid.org/0000-0002-1003-5675",
            "affiliation": "GWU",
            "contribution": [
              "createdBy"
            ],
            "email": "mtaylor@example.edu"
          },
          "status": "unreviewed",
          "date": "2024-11-17T17:00:00.000Z",
          "reviewer_comment": "QC passed. Well described workflow."
        }
      ]
    },
    "usability_domain": [
      "Olduvai protein domains (formerly \"DUF1220\") are the most duplicated protein coding sequence in the human genome (https://doi.org/10.1371/journal.pbio.0020207). They are expressed in many tissues, including strongly in the brain (https://doi.org/10.1126/science.1127980). The copy number of Olduvai domains has been linked to increased brain size (https://doi.org/10.1007/s00429-014-0814-9), and performance on IQ tests (https://doi.org/10.1007/s00439-014-1489-2), as well as neurodiverse states like autism (https://doi.org/10.1371/journal.pgen.1004241).",
      "Precise evaluation of copy number in humans has been difficult to achieve, as ratiometric approaches fail to identify small changes when the total number is in the ~300 range, as in humans.  Read depth approaches using short read WGS data are promising, but most existing pipelines mask repeats altogether (potentially including these sequences), and those that do measure copy number do it relative to the gene. However, Olduvai domains are known to exist in different number and kind within a family of genes, rendering this approach inoperative. The pipeline described here is used to identify the copy number of genetic sequences independent of the genes in which they occur, and with higher fidelity than existing methods, designed specifically with Olduvai copy number in mind.",
      "Approximately 25 individuals were randomly chosen from each of the American (Utah -- Northern and Western European ancestry; CEU), Nigerian (Yoruba; YRI), Han Chinese (Beijing; CHB), Japanese (Tokyo; JPT), Mexican-American (Los Angeles; MXL), Colombian (Medellin; CLM), Puerto Rican (Puerto Rico; PUR), African-American (Southwest US; ASW), Luhya (Webuye, Kenya; LWK), Han Chinese (South China; CHS), Tuscan (Toscana, Italia; TSI), Spanish (Iberian populations; IBS), Finnish (Finland; FIN), and British (England and Scotland; GBR) populations for a total of 324 individuals. Where domains were more than 1 kb apart, the boundaries of the domains were extended up to 250 bp to allow the possibility of capturing unique sequence directly adjacent to the domain. No intermediate files were generated because the commands were executed as a pipe at the command line, so UUIDs were used for the file IOs in the Description Domain. Relative file paths refer to the local machine on which this pipeline was run. This pipeline was created based on the work of Astling et al. doi: 10.1186/s12864-017-3976-z"
    ],
    "description_domain": {
      "keywords": [
        "Genome",
        "Genomics",
        "Simulation",
        "Alignment",
        "Alignment-Strategies",
        "copy-number-variation",
        "cnv",
        "DUF1220",
        "Genome informatics",
        "Next-generation sequencing",
        "Bioinformatics"
      ],
      "platform": [
        "Centos 8"
      ],
      "pipeline_steps": [
        {
          "name": "Spike-In Simulate Reads",
          "description": "The 1_simulate_readlengths.sh script simulates the reads for the 'spike-in' simulation. Single and paired-reads can be randomly sampled from the reference genome (hg38). To simulate duplication or deletion events, the number of reads was varied to simulate one to ten copies of each Olduvai domain.",
          "version": "1.0.0",
          "step_number": 1,
          "input_list": [
            {
              "uri": "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/est.fa.gz",
              "access_time": "2019-08-12T19:11:18.369303"
            }
          ],
          "output_list": [
            {
              "uri": "urn:uuid:8a8b4277-0d0a-4940-9fb4-cd703c98ce83",
              "access_time": "2019-08-12T19:11:18.369303"
            }
          ],
          "prerequisite": [
            {
              "name": "1_simulate_readlengths.sh",
              "uri": {
                "access_time": "2019-08-12T19:11:18.369303",
                "uri": "https://github.com/dpastling/DUF1220_simulation/blob/master/code/simulation_spikein/1_simulate_readlengths.sh",
                "filename": "1_simulate_readlengths.sh"
              }
            }
          ]
        },
        {
          "name": "Cutadapt",
          "version": "1.3.1",
          "step_number": 2,
          "input_list": [
            {
              "uri": "urn:uuid:8a8b4277-0d0a-4940-9fb4-cd703c98ce83",
              "access_time": "2019-08-12T19:11:18.369303"
            }
          ],
          "output_list": [
            {
              "uri": "urn:uuid:4924e1bc-2406-4d3b-8ba3-dc16cb7191fe",
              "access_time": "2019-08-12T19:11:18.369303"
            }
          ],
          "description": "This tool trims reads to remove unwanted sequences (low quality bases in this case); threshold was set at Phred score < 10."
        },
        {
          "name": "Spike-In Trim and Filter Reads",
          "version": "1.0.0",
          "step_number": 3,
          "input_list": [
            {
              "uri": "urn:uuid:4924e1bc-2406-4d3b-8ba3-dc16cb7191fe",
              "access_time": "2019-08-12T19:11:18.369303"
            }
          ],
          "output_list": [
            {
              "uri": "urn:uuid:7b8b984a-6ab6-4a8c-a9f9-7c2b85c67cc8",
              "access_time": "2019-08-12T19:11:18.369303"
            }
          ],
          "prerequisite": [
            {
              "name": "2_trim_spikein.sh",
              "uri": {
                "access_time": "2019-08-12T19:11:18.369303",
                "uri": "https://github.com/dpastling/DUF1220_simulation/blob/master/code/simulation_spikein/2_trim_spikein.sh",
                "filename": "2_trim_spikein.sh"
              }
            }
          ],
          "description": "This script filters and trims reads down to 100 bp to remove low quality bases from the ends."
        },
        {
          "name": "Spike-In Align Reads to the Genome Reference",
          "version": "1.0.0",
          "step_number": 4,
          "input_list": [
            {
              "uri": "urn:uuid:7b8b984a-6ab6-4a8c-a9f9-7c2b85c67cc8",
              "access_time": "2019-08-12T19:11:18.369303"
            }
          ],
          "output_list": [
            {
              "uri": "urn:uuid:f6125de4-5a9e-4d31-8c84-24793b803e29",
              "access_time": "2019-08-12T19:11:18.369303"
            }
          ],
          "prerequisite": [
            {
              "name": "bowtie2_spikein.sh",
              "uri": {
                "access_time": "2019-08-12T19:11:18.369303",
                "uri": "https://github.com/dpastling/DUF1220_simulation/blob/master/code/simulation_spikein/bowtie2_spikein.sh",
                "filename": "bowtie2_spikein.sh"
              }
            }
          ],
          "description": "Alignment of reads to a reference genome. Bowtie2 is an alignment tool that is particularly strong with short read data aligned against large (i.e. mammalian) genomes. This workflow used the 'best' alignment for each read."
        },
        {
          "name": "Calculate Spike-In Coverage for each DUF1220 domain",
          "version": "1.0.0",
          "step_number": 5,
          "input_list": [
            {
              "uri": "urn:uuid:f6125de4-5a9e-4d31-8c84-24793b803e29",
              "access_time": "2019-08-12T19:11:18.369303"
            }
          ],
          "output_list": [
            {
              "uri": "urn:uuid:c9497544-2102-432d-baff-b44af86a4926",
              "access_time": "2019-08-12T19:11:18.369303"
            }
          ],
          "prerequisite": [
            {
              "name": "make_bed_spikein.sh",
              "uri": {
                "access_time": "2019-08-12T19:11:18.369303",
                "uri": "https://github.com/dpastling/DUF1220_simulation/blob/master/code/simulation_spikein/make_bed_spikein.sh",
                "filename": "make_bed_spikein.sh"
              }
            }
          ],
          "description": "This script calculates depth of coverage for each DUF1220 domain (number of times each nucleotide was counted and domains were averaged)."
        },
        {
          "name": "Baseline Simulate Reads",
          "version": "1.0.0",
          "step_number": 6,
          "output_list": [
            {
              "uri": "urn:uuid:c9497544-2102-432d-baff-b44af86a4926",
              "access_time": "2019-08-12T19:11:18.369303"
            }
          ],
          "prerequisite": [
            {
              "name": "Human genome build 38",
              "uri": {
                "access_time": "2019-08-12T19:11:18.369303",
                "uri": "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa",
                "filename": "GRCh38_full_analysis_set_plus_decoy_hla.fa"
              }
            }
          ],
          "description": "This script simulates the reads for the baseline simulation. Single and paired-reads were randomly sampled from the reference genome (hg38). To simulate duplication or deletion events, the number of reads was varied to simulate one to ten copies of each DUF1220 domain. To obtain reads for a single domain, reads overlapping a DUF1220 domain of interest were isolated and aligned back to the genome using each of the alignment strategies below.",
          "input_list": [
            {
              "uri": "urn:uuid:c9497544-2102-432d-baff-b44af86a4926"
            }
          ]
        },
        {
          "name": "Baseline Trim and Filter Reads",
          "version": "1.0.0",
          "step_number": 7,
          "input_list": [
            {
              "uri": "urn:uuid:b8a069a0-1dfe-4f3e-96dc-e8b911f49928",
              "access_time": "2019-08-12T19:11:18.369303"
            }
          ],
          "output_list": [
            {
              "uri": "urn:uuid:86b1e34f-54b9-4aef-aa1d-a25dfb57ce4e",
              "access_time": "2019-08-12T19:11:18.369303"
            }
          ],
          "prerequisite": [
            {
              "name": "2_trim_replicates.sh",
              "uri": {
                "access_time": "2019-08-12T19:11:18.369303",
                "uri": "https://github.com/dpastling/DUF1220_simulation/blob/master/code/simulation_baseline/2_trim_replicates.sh",
                "filename": "2_trim_replicates.sh"
              }
            }
          ],
          "description": "This script filters and trims reads down to 100 bp to remove low quality bases from the ends."
        },
        {
          "name": "Baseline Align Reads to the Genome Reference",
          "version": "1.0.0",
          "step_number": 8,
          "input_list": [
            {
              "uri": "urn:uuid:86b1e34f-54b9-4aef-aa1d-a25dfb57ce4e",
              "access_time": "2019-08-12T19:11:18.369303"
            }
          ],
          "output_list": [
            {
              "uri": "urn:uuid:48cc6aa9-f00a-4ed1-bc65-40745400ee7d",
              "access_time": "2019-08-12T19:11:18.369303"
            }
          ],
          "prerequisite": [
            {
              "name": "bowtie2_replicates.sh",
              "uri": {
                "access_time": "2019-08-12T19:11:18.369303",
                "uri": "https://github.com/dpastling/DUF1220_simulation/blob/master/code/simulation_baseline/bowtie2_replicates.sh",
                "filename": "bowtie2_replicates.sh"
              }
            }
          ],
          "description": "Alignment of reads to a reference genome. Again, this workflow used the 'best' alignment for each read."
        },
        {
          "name": "Calculate Baseline Coverage for each DUF1220 Domain",
          "version": "1.0.0",
          "step_number": 9,
          "input_list": [
            {
              "uri": "urn:uuid:48cc6aa9-f00a-4ed1-bc65-40745400ee7d",
              "access_time": "2019-08-12T19:11:18.369303"
            }
          ],
          "output_list": [
            {
              "uri": "urn:uuid:dcf54233-0cd9-4117-845b-186248278cf2",
              "access_time": "2019-08-12T19:11:18.369303"
            }
          ],
          "prerequisite": [
            {
              "name": "make_bed_replicates.sh",
              "uri": {
                "access_time": "2019-08-12T19:11:18.369303",
                "uri": "https://github.com/dpastling/DUF1220_simulation/blob/master/code/simulation_baseline/make_bed_replicates.sh",
                "filename": "make_bed_replicates.sh"
              }
            },
            {
              "name": "Ultra Conserved Elements",
              "uri": {
                "access_time": "2019-08-12T19:11:18.369303",
                "uri": "http://ucbase.unimore.it/"
              }
            },
            {
              "name": "Database of Genomic Variants",
              "uri": {
                "access_time": "2019-08-12T19:11:18.369303",
                "uri": "http://dgv.tcag.ca/"
              }
            }
          ],
          "description": "This script calculates depth of coverage for each DUF1220 domain (the number of times each base was sequenced, averaged over each domain)."
        },
        {
          "name": "Result Analysis",
          "version": "1.0.0",
          "step_number": 10,
          "prerequisite": [
            {
              "name": "spikein_analysis.R",
              "uri": {
                "access_time": "2019-08-12T19:11:18.369303",
                "uri": "https://github.com/dpastling/DUF1220_simulation/blob/master/code/analysis/spikein_analysis.R",
                "filename": "spikein_analysis.R"
              }
            },
            {
              "name": "replicate_analysis.R",
              "uri": {
                "access_time": "2019-08-12T19:11:18.369303",
                "uri": "https://github.com/dpastling/DUF1220_simulation/blob/master/code/analysis/replicate_analysis.R",
                "filename": "replicate_analysis.R"
              }
            },
            {
              "name": "load_data.R",
              "uri": {
                "access_time": "2019-08-12T19:11:18.369303",
                "uri": "https://github.com/dpastling/DUF1220_simulation/blob/master/code/analysis/load_data.R",
                "filename": "load_data.R"
              }
            },
            {
              "name": "normalize.R",
              "uri": {
                "access_time": "2019-08-12T19:11:18.369303",
                "uri": "https://github.com/dpastling/DUF1220_simulation/blob/master/code/analysis/normalize.R",
                "filename": "normalize.R"
              }
            }
          ],
          "description": "These scripts compile, parse, and analyze the results files in R. spikein_analysis.R processes the result files for the spikein simulation. replicate_analysis.R processes the result files for the baseline simulation. load_data.R gathers all of the result files into a single data frame. normalize.R normalizes the data depending on which type of analysis was run.",
          "input_list": [
            {
              "uri": "https://github.com/kee007ney/hello-world/blob/master/sampleDataIn.txt"
            }
          ],
          "output_list": [
            {
              "uri": "http://localhost:8080/home/keeneyjg/brainVolume/results/outFile.tsv"
            }
          ]
        }
      ]
    },
    "execution_domain": {
      "script": [
        {
          "uri": {
            "uri": "https://github.com/dpastling/DUF1220_simulation/tree/master/code/simulation_spikein/1_simulate_readlengths.sh",
            "filename": "1_simulate_readlengths.sh"
          }
        },
        {
          "uri": {
            "uri": "https://github.com/dpastling/DUF1220_simulation/tree/master/code/simulation_spikein/2_trim_spikein.sh",
            "filename": "2_trim_spikein.sh"
          }
        },
        {
          "uri": {
            "uri": "https://github.com/dpastling/DUF1220_simulation/tree/master/code/simulation_spikein/bowtie2_spikein.sh",
            "filename": "bowtie2_spikein.sh"
          }
        },
        {
          "uri": {
            "uri": "https://github.com/dpastling/DUF1220_simulation/tree/master/code/simulation_spikein/bowtie_multiread_spikein.sh",
            "filename": "bowtie_multiread_spikein.sh"
          }
        },
        {
          "uri": {
            "uri": "https://github.com/dpastling/DUF1220_simulation/tree/master/code/simulation_spikein/mrsfast_spikein.sh",
            "filename": "mrsfast_spikein.sh"
          }
        },
        {
          "uri": {
            "uri": "https://github.com/dpastling/DUF1220_simulation/tree/master/code/simulation_spikein/bowtie_total_spikein.sh",
            "filename": "bowtie_total_spikein.sh"
          }
        },
        {
          "uri": {
            "uri": "https://github.com/dpastling/DUF1220_simulation/tree/master/code/simulation_spikein/make_bed_spikein.sh",
            "filename": "make_bed_spikein.sh"
          }
        },
        {
          "uri": {
            "uri": "https://github.com/dpastling/DUF1220_simulation/tree/master/code/simulation_spikein/1_simulate_replicates.sh",
            "filename": "1_simulate_replicates.sh"
          }
        },
        {
          "uri": {
            "uri": "https://github.com/dpastling/DUF1220_simulation/tree/master/code/simulation_spikein/2_trim_replicates.sh",
            "filename": "2_trim_replicates.sh"
          }
        },
        {
          "uri": {
            "uri": "https://github.com/dpastling/DUF1220_simulation/tree/master/code/simulation_spikein/bowtie2_replicates.sh",
            "filename": "bowtie2_replicates.sh"
          }
        },
        {
          "uri": {
            "uri": "https://github.com/dpastling/DUF1220_simulation/tree/master/code/simulation_spikein/bowtie_multiread_replicates.sh",
            "filename": "bowtie_multiread_replicates.sh"
          }
        },
        {
          "uri": {
            "uri": "https://github.com/dpastling/DUF1220_simulation/tree/master/code/simulation_spikein/mrsfast_replicates.sh",
            "filename": "mrsfast_replicates.sh"
          }
        },
        {
          "uri": {
            "uri": "https://github.com/dpastling/DUF1220_simulation/tree/master/code/simulation_spikein/bowtie_total_replicates.sh",
            "filename": "bowtie_total_replicates.sh"
          }
        },
        {
          "uri": {
            "uri": "https://github.com/dpastling/DUF1220_simulation/tree/master/code/simulation_spikein/make_bed_replicates.sh",
            "filename": "make_bed_replicates.sh"
          }
        }
      ],
      "script_driver": "shell",
      "software_prerequisites": [
        {
          "name": "Bowtie2",
          "version": "2.2.5",
          "uri": {
            "access_time": "2019-08-12T19:11:18.369303",
            "uri": "http://bowtie-bio.sourceforge.net/bowtie2/index.shtml"
          }
        },
        {
          "name": "Bedtools",
          "version": "2.17.0",
          "uri": {
            "access_time": "2019-08-12T19:11:18.369303",
            "uri": "https://bedtools.readthedocs.io/en/latest/"
          }
        },
        {
          "name": "Samtools",
          "version": "0.1.19-44428cd",
          "uri": {
            "access_time": "2019-08-12T19:11:18.369303",
            "uri": "http://samtools.sourceforge.net/",
            "filename": "https://sourceforge.net/projects/samtools/files/samtools/0.1.19/samtools-0.1.19.tar.bz2/download"
          }
        },
        {
          "name": "mrsFAST-Ultra",
          "version": "3.3.11",
          "uri": {
            "access_time": "2019-08-12T19:11:18.369303",
            "uri": "http://sfu-compbio.github.io/mrsfast/",
            "filename": "https://github.com/sfu-compbio/mrsfast/zipball/master"
          }
        },
        {
          "name": "Bowtie1",
          "version": "1.1.2",
          "uri": {
            "access_time": "2019-08-12T19:11:18.369303",
            "uri": "http://bowtie-bio.sourceforge.net/index.shtml",
            "filename": "https://sourceforge.net/projects/bowtie-bio/files/bowtie/1.1.2/bowtie-1.1.2-src.zip/download"
          }
        },
        {
          "name": "Perl module: Math::Random",
          "version": "0.72",
          "uri": {
            "access_time": "2019-08-12T19:11:18.369303",
            "uri": "https://cpan.metacpan.org/authors/id/G/GR/GROMMEL/Math-Random-0.72.tar.gz",
            "filename": "Math-Random-0.72.tar.gz"
          }
        },
        {
          "name": "Perl module: Math::Complex",
          "version": "1.59",
          "uri": {
            "access_time": "2019-08-12T19:11:18.369303",
            "uri": "http://ftp.altlinux.org/pub/distributions/ALTLinux/Sisyphus/noarch/RPMS.classic//perl-Math-Complex-1.59-alt1.noarch.rpm",
            "filename": "perl-Math-Complex-1.59-alt1.noarch.rpm"
          }
        }
      ],
      "external_data_endpoints": [
        {
          "name": "to e-utils",
          "url": "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        }
      ],
      "environment_variables": {
        "key": "EDITOR",
        "value": "vim"
      }
    },
    "io_domain": {
      "input_subdomain": [
        {
          "uri": {
            "uri": "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/est.fa.gz"
          }
        }
      ],
      "output_subdomain": [
        {
          "uri": {
            "uri": "https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-017-3976-z/tables/1"
          },
          "mediatype": "text/csv"
        },
        {
          "uri": {
            "uri": "https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-017-3976-z/tables/2"
          },
          "mediatype": "text/csv"
        },
        {
          "uri": {
            "uri": "http://localhost:8080/home/keeneyjg/brainVolume/results/outFile.tsv"
          },
          "mediatype": "text/tsv"
        }
      ]
    },
    "parametric_domain": [
      {
        "step": "1",
        "param": "command line args",
        "value": "cutadapt -a XXX -A XXX -q 10 --minimum-length 80 --trim-n"
      },
      {
        "step": "7",
        "param": "command line arg",
        "value": "--very-sensitive"
      },
      {
        "step": "7",
        "param": "max insert size",
        "value": "800"
      },
      {
        "step": "8",
        "param": "alignment type",
        "value": "best alignment"
      }
    ],
    "error_domain": {
      "empirical_error": {
        "Root mean square error by domain for Best Alignment": {
          "CON1": 1.55,
          "CON2": 0.91,
          "CON3": 0.26,
          "HLS1": 0.99,
          "HLS2": 1.9,
          "HLS3": 1.67
        }
      },
      "algorithmic_error": {}
    },
    "etag": "73bb5452832feba8c462db900ce029a64de0d25ddf619c61133a8a6fd08e08d4"
  }