Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upgrade to new (backwards-compatible) picovcf version #9

Merged
merged 1 commit into from
Aug 28, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 130 additions & 47 deletions third-party/picovcf/picovcf.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1179,6 +1179,22 @@ static inline T readScalar(std::istream& inStream) {
return simpleValue;
}

static inline size_t readStringLen(const uint64_t version, std::istream& inStream) {
if (version == 3) {
return readScalar<uint64_t>(inStream);
}
return readScalar<uint32_t>(inStream);
}

static inline std::string readString(const uint64_t version, std::istream& inStream) {
const auto strLength = readStringLen(version, inStream);
PICOVCF_GOOD_OR_MALFORMED_FILE(inStream);
std::string strValue;
strValue.resize(strLength);
inStream.read(const_cast<char*>(strValue.c_str()), strLength);
return std::move(strValue);
}

/** Vector of sample indexes (IDs) */
using IGDSampleList = std::vector<SampleT>;

Expand Down Expand Up @@ -1272,10 +1288,10 @@ class IGDData {
/** Flag that indicates the genotype data is phased */
static constexpr uint64_t IGD_PHASED = 0x1;
/** The IGD file format version that this library writes */
static constexpr uint64_t CURRENT_IGD_VERSION = 3;
static constexpr uint64_t CURRENT_IGD_VERSION = 4;
/** If fewer than NumSamples/DEFAULT_SPARSE_THRESHOLD samples have a particular variant, we will
store it sparsely. */
static constexpr uint32_t DEFAULT_SPARSE_THRESHOLD = 128;
static constexpr uint32_t DEFAULT_SPARSE_THRESHOLD = 32;

/* See IGD.FORMAT.md for detailed file format. The layout is:
* - FixedHeader (128 bytes)
Expand Down Expand Up @@ -1307,7 +1323,8 @@ class IGDData {
uint64_t filePosIndex; // Byte offset in the file where the variant rows start.
uint64_t filePosVariants; // Byte offset in the file where the variant allele information start.
uint64_t filePosIndividualIds; // Byte offset in the file where the identifiers for individuals start.
uint64_t unused[7];
uint64_t filePosVariantIds; // Byte offset in the file where the identifiers for variants start.
uint64_t unused[6];
};
static_assert(sizeof(FixedHeader) == 128, "Header size is fixed");

Expand Down Expand Up @@ -1380,6 +1397,14 @@ class IGDData {
return m_source;
}

/**
* Free-form description of the file contents.
* @return A string description.
*/
std::string getDescription() const {
return m_description;
}

/**
* The ploidy of _all_ the data contained in the file.
* @return ploidy value >= 1.
Expand Down Expand Up @@ -1504,7 +1529,10 @@ class IGDData {
}

/**
* Read the (optional) list of label identifiers from the file.
* Read the (optional) list of individual identifiers from the file.
*
* @return A newly created std::vector<std::string> object. Individual 0 has its label at
* position 0, and the last individual has its label at position (numIndividuals()-1).
*/
std::vector<std::string> getIndividualIds() {
std::vector<std::string> result;
Expand All @@ -1513,15 +1541,31 @@ class IGDData {
const size_t numLabels = readScalar<uint64_t>(m_infile);
PICOVCF_ASSERT_OR_MALFORMED(numLabels == m_header.numIndividuals, "Invalid number of individual ids");
for (size_t i = 0; i < numLabels; i++) {
const size_t idLength = readScalar<uint64_t>(m_infile);
std::string idString;
idString.resize(idLength);
m_infile.read(const_cast<char*>(idString.c_str()), idLength);
result.emplace_back(std::move(idString));
result.emplace_back(std::move(readString(m_header.version, m_infile)));
}
}
return result;
return std::move(result);
}

/**
* Read the (optional) list of variant identifiers from the file.
*
* @return A newly created std::vector<std::string> object. Variant 0 has its label at
* position 0, and the last variant has its identifier at position (numVariants()-1).
*/
std::vector<std::string> getVariantIds() {
std::vector<std::string> result;
if (0 != m_header.filePosVariantIds) {
m_infile.seekg(m_header.filePosVariantIds);
const size_t numLabels = readScalar<uint64_t>(m_infile);
PICOVCF_ASSERT_OR_MALFORMED(numLabels == m_header.numVariants, "Invalid number of variant ids");
for (size_t i = 0; i < numLabels; i++) {
result.emplace_back(std::move(readString(m_header.version, m_infile)));
}
}
return std::move(result);
}

private:
size_t getVariantIndexOffset(VariantT variantIndex) {
return m_header.filePosIndex + (variantIndex * sizeof(IndexEntry));
Expand All @@ -1532,13 +1576,12 @@ class IGDData {
PICOVCF_GOOD_OR_MALFORMED_FILE(m_infile);
PICOVCF_ASSERT_OR_MALFORMED(m_header.magic == IGD_MAGIC,
"Invalid magic " << std::hex << m_header.magic);
PICOVCF_ASSERT_OR_MALFORMED(m_header.version == CURRENT_IGD_VERSION,
static_assert(CURRENT_IGD_VERSION == 4, "Remove/check backwards compatibility if incrementing version");
PICOVCF_ASSERT_OR_MALFORMED(m_header.version == CURRENT_IGD_VERSION
|| m_header.version == 3,
"Unsupported file format version " << m_header.version);
const size_t sourceLen = readScalar<uint64_t>(m_infile);
m_source.resize(sourceLen);
m_infile.read(const_cast<char*>(m_source.c_str()), sourceLen);
const size_t descriptionLen = readScalar<uint64_t>(m_infile);
m_infile.read(const_cast<char*>(m_description.c_str()), descriptionLen);
m_source = readString(m_header.version, m_infile);
m_description = readString(m_header.version, m_infile);
m_beforeFirstVariant = m_infile.tellg();
PICOVCF_ASSERT_OR_MALFORMED(m_header.filePosVariants > m_beforeFirstVariant,
"Invalid variant info position " << m_header.filePosVariants);
Expand All @@ -1557,23 +1600,15 @@ class IGDData {
m_referenceAlleles.reserve(m_header.numVariants);
m_alternateAlleles.reserve(m_header.numVariants);
for (VariantT i = 0; i < m_header.numVariants; i++) {
std::string reference;
const size_t refLen = readScalar<uint64_t>(m_infile);
PICOVCF_GOOD_OR_MALFORMED_FILE(m_infile);
reference.resize(refLen);
m_infile.read(const_cast<char*>(reference.c_str()), refLen);
std::string reference = readString(m_header.version, m_infile);
PICOVCF_GOOD_OR_MALFORMED_FILE(m_infile);
IGDAllele refAllele(reference, m_longAlleles.size());
if (refAllele.isLong()) {
m_longAlleles.push_back(std::move(reference));
}
m_referenceAlleles.push_back(std::move(refAllele));

std::string alternate;
const size_t altLen = readScalar<uint64_t>(m_infile);
PICOVCF_GOOD_OR_MALFORMED_FILE(m_infile);
alternate.resize(altLen);
m_infile.read(const_cast<char*>(alternate.c_str()), altLen);
std::string alternate = readString(m_header.version, m_infile);
PICOVCF_GOOD_OR_MALFORMED_FILE(m_infile);
IGDAllele altAllele(alternate, m_longAlleles.size());
if (altAllele.isLong()) {
Expand Down Expand Up @@ -1603,6 +1638,11 @@ static inline void writeScalar(T intValue, std::ostream& outStream) {
outStream.write(reinterpret_cast<const char*>(&intValue), sizeof(intValue));
}

static inline void writeString(const std::string& value, std::ostream& outStream) {
writeScalar<uint32_t>(value.size(), outStream);
outStream.write(value.c_str(), value.size());
}

inline void writeAllelesAsOnes(std::ostream& outStream,
const IGDSampleList& orderedSampleList,
SampleT numSamples) {
Expand Down Expand Up @@ -1678,10 +1718,8 @@ class IGDWriter {
const std::string& source,
const std::string& description) {
outStream.write(reinterpret_cast<const char*>(&m_header), sizeof(m_header));
writeScalar<uint64_t>(source.size(), outStream);
outStream.write(source.c_str(), source.size());
writeScalar<uint64_t>(description.size(), outStream);
outStream.write(description.c_str(), description.size());
writeString(source, outStream);
writeString(description, outStream);
}

/**
Expand Down Expand Up @@ -1724,7 +1762,7 @@ class IGDWriter {

/**
* Write the table of information about the variants. This information is collected and saved
* by writeSamplesRow(), so this function must be called _after_ that one.
* by writeVariantSamples(), so this function must be called _after_ that one.
* @param[in] outStream The output stream.
*/
void writeIndex(std::ostream& outStream) {
Expand All @@ -1736,27 +1774,25 @@ class IGDWriter {

/**
* Write the table of information about the variants. This information is collected and saved
* by writeSamplesRow(), so this function must be called _after_ that one.
* by writeVariantSamples(), so this function must be called _after_ that one.
* @param[in] outStream The output stream.
*/
void writeVariantInfo(std::ostream& outStream) {
m_header.filePosVariants = outStream.tellp();
for (VariantT i = 0; i < m_header.numVariants; i++) {
const std::string& ref = m_referenceAlleles.at(i);
writeScalar<uint64_t>(ref.size(), outStream);
outStream.write(ref.c_str(), ref.size());
writeString(ref, outStream);

const std::string& alt = m_alternateAlleles.at(i);
writeScalar<uint64_t>(alt.size(), outStream);
outStream.write(alt.c_str(), alt.size());
writeString(alt, outStream);
}
}

/**
* Write the table of individual identifiers.
*
* This is an optional part of the IGD file. You can create and IGD and not call this method, in
* which case the sample IDs will just be empty.
* This is an optional part of the IGD file. You can create an IGD and not call this method, in
* which case the individual ID table will just be empty.
*
* @param[in] outStream The output stream.
* @param[in] labels A list (vector) of string identifiers. One id per individual.
Expand All @@ -1773,12 +1809,38 @@ class IGDWriter {

writeScalar<uint64_t>(labels.size(), outStream);
for (const auto& label : labels) {
writeScalar<uint64_t>(label.size(), outStream);
outStream.write(label.c_str(), label.size());
writeString(label, outStream);
}
}
}

/**
* Write the table of variant identifiers.
*
* This is an optional part of the IGD file. You can create an IGD and not call this method, in
* which case the variant ID table will just be empty.
*
* @param[in] outStream The output stream.
* @param[in] labels A list (vector) of string identifiers. One id per variant.
*/
void writeVariantIds(std::ostream& outStream, const std::vector<std::string>& labels) {
if (labels.empty()) {
m_header.filePosVariantIds = 0;
} else {
m_header.filePosVariantIds = outStream.tellp();
PICOVCF_RELEASE_ASSERT(0 != m_header.filePosVariantIds);
if (labels.size() != m_header.numVariants) {
PICOVCF_THROW_ERROR(ApiMisuse, "Must provide one label per individual");
}

writeScalar<uint64_t>(labels.size(), outStream);
for (const auto& label : labels) {
writeString(label, outStream);
}
}
}


size_t m_sparseCount{};
size_t m_totalCount{};
private:
Expand Down Expand Up @@ -1819,11 +1881,16 @@ class IGDWriter {
* @param[in] vcfFilename The name of the input VCF file to be converted.
* @param[in] outFilename The name of the output IGD file to be created.
* @param[in] description [Optional] A description of the dataset.
* @param[in] verbose [Optional] Set to true to get statistics printed to stdout.
* @param[in] emitIndividualIds [Optional] Copy individual IDs to IGD file (false by default).
* @param[in] emitVariantIds [Optional] Copy variant IDs to IGD file (false by default).
*/
inline void vcfToIGD(const std::string& vcfFilename,
const std::string& outFilename,
std::string description = "",
bool verbose = false) {
bool verbose = false,
bool emitIndividualIds = false,
bool emitVariantIds = false) {
VCFFile vcf(vcfFilename);
vcf.seekBeforeVariants();
PICOVCF_ASSERT_OR_MALFORMED(vcf.hasNextVariant(), "VCF file has no variants");
Expand All @@ -1838,6 +1905,7 @@ inline void vcfToIGD(const std::string& vcfFilename,
const bool isPhased = firstIndividual.getAlleles(allele1, allele2, /*moveNext=*/false);
const uint64_t ploidy = (allele2 == NOT_DIPLOID) ? 1 : 2;

std::vector<std::string> variantIds;
std::ofstream outFile(outFilename, std::ios::binary);
IGDWriter writer(ploidy, vcf.numIndividuals(), isPhased);
writer.writeHeader(outFile, vcfFilename, description);
Expand Down Expand Up @@ -1870,13 +1938,20 @@ inline void vcfToIGD(const std::string& vcfFilename,
sampleIndex++;
}
}
std::string currentVariantId;
if (emitVariantIds) {
currentVariantId = variant.getID();
}
for (size_t altIndex = 0; altIndex < altAlleles.size(); altIndex++) {
writer.writeVariantSamples(outFile,
variant.getPosition(),
variant.getRefAllele(),
altAlleles[altIndex],
variantGtData[altIndex],
false);
variant.getPosition(),
variant.getRefAllele(),
altAlleles[altIndex],
variantGtData[altIndex],
false);
if (emitVariantIds) {
variantIds.emplace_back(currentVariantId);
}
}
if (!missingData.empty()) {
writer.writeVariantSamples(outFile,
Expand All @@ -1885,11 +1960,19 @@ inline void vcfToIGD(const std::string& vcfFilename,
"",
missingData,
true);
if (emitVariantIds) {
variantIds.emplace_back(currentVariantId);
}
}
}
writer.writeIndex(outFile);
writer.writeVariantInfo(outFile);
writer.writeIndividualIds(outFile, vcf.getIndividualLabels());
if (emitIndividualIds) {
writer.writeIndividualIds(outFile, vcf.getIndividualLabels());
}
if (emitVariantIds) {
writer.writeVariantIds(outFile, variantIds);
}
outFile.seekp(0);
writer.writeHeader(outFile, vcfFilename, description);

Expand Down
Loading