Skip to content

Commit

Permalink
Upgrade to new (backwards-compatible) picovcf version
Browse files Browse the repository at this point in the history
This adds support for IGD v4, which is slightly smaller than IGD v3.
  • Loading branch information
dcdehaas committed Aug 28, 2024
1 parent 6e5ec34 commit aad0319
Showing 1 changed file with 130 additions and 47 deletions.
177 changes: 130 additions & 47 deletions third-party/picovcf/picovcf.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1179,6 +1179,22 @@ static inline T readScalar(std::istream& inStream) {
return simpleValue;
}

static inline size_t readStringLen(const uint64_t version, std::istream& inStream) {
if (version == 3) {
return readScalar<uint64_t>(inStream);
}
return readScalar<uint32_t>(inStream);
}

static inline std::string readString(const uint64_t version, std::istream& inStream) {
const auto strLength = readStringLen(version, inStream);
PICOVCF_GOOD_OR_MALFORMED_FILE(inStream);
std::string strValue;
strValue.resize(strLength);
inStream.read(const_cast<char*>(strValue.c_str()), strLength);
return std::move(strValue);
}

/** Vector of sample indexes (IDs) */
using IGDSampleList = std::vector<SampleT>;

Expand Down Expand Up @@ -1272,10 +1288,10 @@ class IGDData {
/** Flag that indicates the genotype data is phased */
static constexpr uint64_t IGD_PHASED = 0x1;
/** The IGD file format version that this library writes */
static constexpr uint64_t CURRENT_IGD_VERSION = 3;
static constexpr uint64_t CURRENT_IGD_VERSION = 4;
/** If fewer than NumSamples/DEFAULT_SPARSE_THRESHOLD samples have a particular variant, we will
store it sparsely. */
static constexpr uint32_t DEFAULT_SPARSE_THRESHOLD = 128;
static constexpr uint32_t DEFAULT_SPARSE_THRESHOLD = 32;

/* See IGD.FORMAT.md for detailed file format. The layout is:
* - FixedHeader (128 bytes)
Expand Down Expand Up @@ -1307,7 +1323,8 @@ class IGDData {
uint64_t filePosIndex; // Byte offset in the file where the variant rows start.
uint64_t filePosVariants; // Byte offset in the file where the variant allele information start.
uint64_t filePosIndividualIds; // Byte offset in the file where the identifiers for individuals start.
uint64_t unused[7];
uint64_t filePosVariantIds; // Byte offset in the file where the identifiers for variants start.
uint64_t unused[6];
};
static_assert(sizeof(FixedHeader) == 128, "Header size is fixed");

Expand Down Expand Up @@ -1380,6 +1397,14 @@ class IGDData {
return m_source;
}

/**
* Free-form description of the file contents.
* @return A string description.
*/
std::string getDescription() const {
return m_description;
}

/**
* The ploidy of _all_ the data contained in the file.
* @return ploidy value >= 1.
Expand Down Expand Up @@ -1504,7 +1529,10 @@ class IGDData {
}

/**
* Read the (optional) list of label identifiers from the file.
* Read the (optional) list of individual identifiers from the file.
*
* @return A newly created std::vector<std::string> object. Individual 0 has its label at
* position 0, and the last individual has its label at position (numIndividuals()-1).
*/
std::vector<std::string> getIndividualIds() {
std::vector<std::string> result;
Expand All @@ -1513,15 +1541,31 @@ class IGDData {
const size_t numLabels = readScalar<uint64_t>(m_infile);
PICOVCF_ASSERT_OR_MALFORMED(numLabels == m_header.numIndividuals, "Invalid number of individual ids");
for (size_t i = 0; i < numLabels; i++) {
const size_t idLength = readScalar<uint64_t>(m_infile);
std::string idString;
idString.resize(idLength);
m_infile.read(const_cast<char*>(idString.c_str()), idLength);
result.emplace_back(std::move(idString));
result.emplace_back(std::move(readString(m_header.version, m_infile)));
}
}
return result;
return std::move(result);
}

/**
* Read the (optional) list of variant identifiers from the file.
*
* @return A newly created std::vector<std::string> object. Variant 0 has its label at
* position 0, and the last variant has its identifier at position (numVariants()-1).
*/
std::vector<std::string> getVariantIds() {
std::vector<std::string> result;
if (0 != m_header.filePosVariantIds) {
m_infile.seekg(m_header.filePosVariantIds);
const size_t numLabels = readScalar<uint64_t>(m_infile);
PICOVCF_ASSERT_OR_MALFORMED(numLabels == m_header.numVariants, "Invalid number of variant ids");
for (size_t i = 0; i < numLabels; i++) {
result.emplace_back(std::move(readString(m_header.version, m_infile)));
}
}
return std::move(result);
}

private:
size_t getVariantIndexOffset(VariantT variantIndex) {
return m_header.filePosIndex + (variantIndex * sizeof(IndexEntry));
Expand All @@ -1532,13 +1576,12 @@ class IGDData {
PICOVCF_GOOD_OR_MALFORMED_FILE(m_infile);
PICOVCF_ASSERT_OR_MALFORMED(m_header.magic == IGD_MAGIC,
"Invalid magic " << std::hex << m_header.magic);
PICOVCF_ASSERT_OR_MALFORMED(m_header.version == CURRENT_IGD_VERSION,
static_assert(CURRENT_IGD_VERSION == 4, "Remove/check backwards compatibility if incrementing version");
PICOVCF_ASSERT_OR_MALFORMED(m_header.version == CURRENT_IGD_VERSION
|| m_header.version == 3,
"Unsupported file format version " << m_header.version);
const size_t sourceLen = readScalar<uint64_t>(m_infile);
m_source.resize(sourceLen);
m_infile.read(const_cast<char*>(m_source.c_str()), sourceLen);
const size_t descriptionLen = readScalar<uint64_t>(m_infile);
m_infile.read(const_cast<char*>(m_description.c_str()), descriptionLen);
m_source = readString(m_header.version, m_infile);
m_description = readString(m_header.version, m_infile);
m_beforeFirstVariant = m_infile.tellg();
PICOVCF_ASSERT_OR_MALFORMED(m_header.filePosVariants > m_beforeFirstVariant,
"Invalid variant info position " << m_header.filePosVariants);
Expand All @@ -1557,23 +1600,15 @@ class IGDData {
m_referenceAlleles.reserve(m_header.numVariants);
m_alternateAlleles.reserve(m_header.numVariants);
for (VariantT i = 0; i < m_header.numVariants; i++) {
std::string reference;
const size_t refLen = readScalar<uint64_t>(m_infile);
PICOVCF_GOOD_OR_MALFORMED_FILE(m_infile);
reference.resize(refLen);
m_infile.read(const_cast<char*>(reference.c_str()), refLen);
std::string reference = readString(m_header.version, m_infile);
PICOVCF_GOOD_OR_MALFORMED_FILE(m_infile);
IGDAllele refAllele(reference, m_longAlleles.size());
if (refAllele.isLong()) {
m_longAlleles.push_back(std::move(reference));
}
m_referenceAlleles.push_back(std::move(refAllele));

std::string alternate;
const size_t altLen = readScalar<uint64_t>(m_infile);
PICOVCF_GOOD_OR_MALFORMED_FILE(m_infile);
alternate.resize(altLen);
m_infile.read(const_cast<char*>(alternate.c_str()), altLen);
std::string alternate = readString(m_header.version, m_infile);
PICOVCF_GOOD_OR_MALFORMED_FILE(m_infile);
IGDAllele altAllele(alternate, m_longAlleles.size());
if (altAllele.isLong()) {
Expand Down Expand Up @@ -1603,6 +1638,11 @@ static inline void writeScalar(T intValue, std::ostream& outStream) {
outStream.write(reinterpret_cast<const char*>(&intValue), sizeof(intValue));
}

static inline void writeString(const std::string& value, std::ostream& outStream) {
writeScalar<uint32_t>(value.size(), outStream);
outStream.write(value.c_str(), value.size());
}

inline void writeAllelesAsOnes(std::ostream& outStream,
const IGDSampleList& orderedSampleList,
SampleT numSamples) {
Expand Down Expand Up @@ -1678,10 +1718,8 @@ class IGDWriter {
const std::string& source,
const std::string& description) {
outStream.write(reinterpret_cast<const char*>(&m_header), sizeof(m_header));
writeScalar<uint64_t>(source.size(), outStream);
outStream.write(source.c_str(), source.size());
writeScalar<uint64_t>(description.size(), outStream);
outStream.write(description.c_str(), description.size());
writeString(source, outStream);
writeString(description, outStream);
}

/**
Expand Down Expand Up @@ -1724,7 +1762,7 @@ class IGDWriter {

/**
* Write the table of information about the variants. This information is collected and saved
* by writeSamplesRow(), so this function must be called _after_ that one.
* by writeVariantSamples(), so this function must be called _after_ that one.
* @param[in] outStream The output stream.
*/
void writeIndex(std::ostream& outStream) {
Expand All @@ -1736,27 +1774,25 @@ class IGDWriter {

/**
* Write the table of information about the variants. This information is collected and saved
* by writeSamplesRow(), so this function must be called _after_ that one.
* by writeVariantSamples(), so this function must be called _after_ that one.
* @param[in] outStream The output stream.
*/
void writeVariantInfo(std::ostream& outStream) {
m_header.filePosVariants = outStream.tellp();
for (VariantT i = 0; i < m_header.numVariants; i++) {
const std::string& ref = m_referenceAlleles.at(i);
writeScalar<uint64_t>(ref.size(), outStream);
outStream.write(ref.c_str(), ref.size());
writeString(ref, outStream);

const std::string& alt = m_alternateAlleles.at(i);
writeScalar<uint64_t>(alt.size(), outStream);
outStream.write(alt.c_str(), alt.size());
writeString(alt, outStream);
}
}

/**
* Write the table of individual identifiers.
*
* This is an optional part of the IGD file. You can create and IGD and not call this method, in
* which case the sample IDs will just be empty.
* This is an optional part of the IGD file. You can create an IGD and not call this method, in
* which case the individual ID table will just be empty.
*
* @param[in] outStream The output stream.
* @param[in] labels A list (vector) of string identifiers. One id per individual.
Expand All @@ -1773,12 +1809,38 @@ class IGDWriter {

writeScalar<uint64_t>(labels.size(), outStream);
for (const auto& label : labels) {
writeScalar<uint64_t>(label.size(), outStream);
outStream.write(label.c_str(), label.size());
writeString(label, outStream);
}
}
}

/**
* Write the table of variant identifiers.
*
* This is an optional part of the IGD file. You can create an IGD and not call this method, in
* which case the variant ID table will just be empty.
*
* @param[in] outStream The output stream.
* @param[in] labels A list (vector) of string identifiers. One id per variant.
*/
void writeVariantIds(std::ostream& outStream, const std::vector<std::string>& labels) {
if (labels.empty()) {
m_header.filePosVariantIds = 0;
} else {
m_header.filePosVariantIds = outStream.tellp();
PICOVCF_RELEASE_ASSERT(0 != m_header.filePosVariantIds);
if (labels.size() != m_header.numVariants) {
PICOVCF_THROW_ERROR(ApiMisuse, "Must provide one label per individual");
}

writeScalar<uint64_t>(labels.size(), outStream);
for (const auto& label : labels) {
writeString(label, outStream);
}
}
}


size_t m_sparseCount{};
size_t m_totalCount{};
private:
Expand Down Expand Up @@ -1819,11 +1881,16 @@ class IGDWriter {
* @param[in] vcfFilename The name of the input VCF file to be converted.
* @param[in] outFilename The name of the output IGD file to be created.
* @param[in] description [Optional] A description of the dataset.
* @param[in] verbose [Optional] Set to true to get statistics printed to stdout.
* @param[in] emitIndividualIds [Optional] Copy individual IDs to IGD file (false by default).
* @param[in] emitVariantIds [Optional] Copy variant IDs to IGD file (false by default).
*/
inline void vcfToIGD(const std::string& vcfFilename,
const std::string& outFilename,
std::string description = "",
bool verbose = false) {
bool verbose = false,
bool emitIndividualIds = false,
bool emitVariantIds = false) {
VCFFile vcf(vcfFilename);
vcf.seekBeforeVariants();
PICOVCF_ASSERT_OR_MALFORMED(vcf.hasNextVariant(), "VCF file has no variants");
Expand All @@ -1838,6 +1905,7 @@ inline void vcfToIGD(const std::string& vcfFilename,
const bool isPhased = firstIndividual.getAlleles(allele1, allele2, /*moveNext=*/false);
const uint64_t ploidy = (allele2 == NOT_DIPLOID) ? 1 : 2;

std::vector<std::string> variantIds;
std::ofstream outFile(outFilename, std::ios::binary);
IGDWriter writer(ploidy, vcf.numIndividuals(), isPhased);
writer.writeHeader(outFile, vcfFilename, description);
Expand Down Expand Up @@ -1870,13 +1938,20 @@ inline void vcfToIGD(const std::string& vcfFilename,
sampleIndex++;
}
}
std::string currentVariantId;
if (emitVariantIds) {
currentVariantId = variant.getID();
}
for (size_t altIndex = 0; altIndex < altAlleles.size(); altIndex++) {
writer.writeVariantSamples(outFile,
variant.getPosition(),
variant.getRefAllele(),
altAlleles[altIndex],
variantGtData[altIndex],
false);
variant.getPosition(),
variant.getRefAllele(),
altAlleles[altIndex],
variantGtData[altIndex],
false);
if (emitVariantIds) {
variantIds.emplace_back(currentVariantId);
}
}
if (!missingData.empty()) {
writer.writeVariantSamples(outFile,
Expand All @@ -1885,11 +1960,19 @@ inline void vcfToIGD(const std::string& vcfFilename,
"",
missingData,
true);
if (emitVariantIds) {
variantIds.emplace_back(currentVariantId);
}
}
}
writer.writeIndex(outFile);
writer.writeVariantInfo(outFile);
writer.writeIndividualIds(outFile, vcf.getIndividualLabels());
if (emitIndividualIds) {
writer.writeIndividualIds(outFile, vcf.getIndividualLabels());
}
if (emitVariantIds) {
writer.writeVariantIds(outFile, variantIds);
}
outFile.seekp(0);
writer.writeHeader(outFile, vcfFilename, description);

Expand Down

0 comments on commit aad0319

Please sign in to comment.