Convert UCSC genpred/knowngene file to gff3 or gtf
This program is now part of the main jvarkit
tool. See jvarkit for compiling.
Usage: java -jar dist/jvarkit.jar kg2gff [options] Files
Usage: kg2gff [options] Files
Options:
--coding
select coding transcript only.
Default: false
--gtf
use the GTF writer instead of GFF3
Default: false
-h, --help
print help and exit
--helpFormat
What kind of help. One of [usage,markdown,xml].
--modulo3
discard transcripts where CDS length isn't a multiple of 3 (e.g. remove
xeno-transcripts mapped on another build)
Default: false
-o, --output
Output file. Optional. Default: stdout
--score
default score (-1: undefined)
Default: -1
--source
label for column 'source' in the output
Default: ucsc
--version
print version and exit
20210106
The project is licensed under the MIT license.
Should you cite kg2gff ? https://github.com/mr-c/shouldacite/blob/master/should-I-cite-this-software.md
The current reference is:
http://dx.doi.org/10.6084/m9.figshare.1425030
Lindenbaum, Pierre (2015): JVarkit: java-based utilities for Bioinformatics. figshare. http://dx.doi.org/10.6084/m9.figshare.1425030
Output is escaped for UTF-8; some characters like ‘:’ might be converted to a percent-encoded hexadecimal form (e.g. ‘:’ becomes ‘%3A’).
head jeter.kg -n1 | java -jar dist/jvarkit.jar kg2gff
##gff-version 3.1.25
chr1 ucsc gene 55039456 55064852 . + . ID=gene%3APCSK9;Name=PCSK9;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;gene_type=protein_coding
chr1 ucsc mRNA 55039456 55064852 . + . ID=ENST00000710286.1;Parent=gene%3APCSK9;Name=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1
chr1 ucsc exon 55039456 55040044 . + . ID=ENST00000710286.1%3AE0;Parent=ENST00000710286.1;Name=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1;exon_id=ENST00000710286.1%3AE0
chr1 ucsc exon 55043843 55044034 . + . ID=ENST00000710286.1%3AE1;Parent=ENST00000710286.1;Name=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1;exon_id=ENST00000710286.1%3AE1
chr1 ucsc exon 55046523 55046646 . + . ID=ENST00000710286.1%3AE2;Parent=ENST00000710286.1;Name=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1;exon_id=ENST00000710286.1%3AE2
chr1 ucsc exon 55052278 55052411 . + . ID=ENST00000710286.1%3AE3;Parent=ENST00000710286.1;Name=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1;exon_id=ENST00000710286.1%3AE3
chr1 ucsc exon 55052650 55052791 . + . ID=ENST00000710286.1%3AE4;Parent=ENST00000710286.1;Name=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1;exon_id=ENST00000710286.1%3AE4
chr1 ucsc exon 55055993 55056189 . + . ID=ENST00000710286.1%3AE5;Parent=ENST00000710286.1;Name=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1;exon_id=ENST00000710286.1%3AE5
chr1 ucsc exon 55057331 55057514 . + . ID=ENST00000710286.1%3AE6;Parent=ENST00000710286.1;Name=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1;exon_id=ENST00000710286.1%3AE6
chr1 ucsc exon 55058036 55058209 . + . ID=ENST00000710286.1%3AE7;Parent=ENST00000710286.1;Name=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1;exon_id=ENST00000710286.1%3AE7
chr1 ucsc exon 55058499 55058647 . + . ID=ENST00000710286.1%3AE8;Parent=ENST00000710286.1;Name=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1;exon_id=ENST00000710286.1%3AE8
chr1 ucsc exon 55059486 55059663 . + . ID=ENST00000710286.1%3AE9;Parent=ENST00000710286.1;Name=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1;exon_id=ENST00000710286.1%3AE9
chr1 ucsc exon 55061375 55061556 . + . ID=ENST00000710286.1%3AE10;Parent=ENST00000710286.1;Name=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1;exon_id=ENST00000710286.1%3AE10
chr1 ucsc exon 55063369 55064852 . + . ID=ENST00000710286.1%3AE11;Parent=ENST00000710286.1;Name=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1;exon_id=ENST00000710286.1%3AE11
chr1 ucsc CDS 55039481 55040044 . + 0 ID=ENST00000710286.1%3AC1;Parent=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1
chr1 ucsc CDS 55043843 55044034 . + 0 ID=ENST00000710286.1%3AC2;Parent=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1
chr1 ucsc CDS 55046523 55046646 . + 0 ID=ENST00000710286.1%3AC3;Parent=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1
chr1 ucsc CDS 55052278 55052411 . + 2 ID=ENST00000710286.1%3AC4;Parent=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1
chr1 ucsc CDS 55052650 55052791 . + 0 ID=ENST00000710286.1%3AC5;Parent=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1
chr1 ucsc CDS 55055993 55056189 . + 2 ID=ENST00000710286.1%3AC6;Parent=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1
chr1 ucsc CDS 55057331 55057514 . + 0 ID=ENST00000710286.1%3AC7;Parent=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1
chr1 ucsc CDS 55058036 55058209 . + 2 ID=ENST00000710286.1%3AC8;Parent=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1
chr1 ucsc CDS 55058499 55058647 . + 2 ID=ENST00000710286.1%3AC9;Parent=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1
chr1 ucsc CDS 55059486 55059663 . + 0 ID=ENST00000710286.1%3AC10;Parent=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1
chr1 ucsc CDS 55061375 55061556 . + 2 ID=ENST00000710286.1%3AC11;Parent=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1
chr1 ucsc CDS 55063369 55063584 . + 0 ID=ENST00000710286.1%3AC12;Parent=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1
chr1 ucsc 5_prime_UTR 55039456 55039480 . + . ID=ENST00000710286.1%3AU55039456;Parent=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1
chr1 ucsc 3_prime_UTR 55063585 55064852 . + . ID=ENST00000710286.1%3AU55063585;Parent=ENST00000710286.1;biotype=protein_coding;gene_id=gene%3APCSK9;gene_name=PCSK9;transcript_id=ENST00000710286.1