//===========================================================================
/*!
*
*
* \brief Support for importing data from HDF5 files
*
*
* \par
* The most important application of the methods provided in this
* file is the import of data from HDF5 files into Shark data
* containers.
*
*
*
*
* \author B. Li
* \date 2012
*
*
* \par Copyright 1995-2017 Shark Development Team
*
*
* This file is part of Shark.
*
*
* Shark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Shark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Shark. If not, see <http://www.gnu.org/licenses/>.
*
*/
//===========================================================================
#ifndef SHARK_DATA_HDF5_H
#define SHARK_DATA_HDF5_H
#include "shark/Core/utility/ScopedHandle.h"
#include "shark/Data/Dataset.h"
#include <hdf5.h> // This must come before #include <hdf5_hl.h>
#include <hdf5_hl.h>
#include <boost/array.hpp>
#include <boost/cstdint.hpp>
#include <boost/format.hpp>
#include <boost/range/algorithm/fill.hpp>
#include <boost/range/algorithm/max_element.hpp>
#include <boost/scoped_array.hpp>
#include <string>
#include <type_traits>
#include <vector>
namespace shark {
namespace detail {
/// Overload functions so that the compiler is able to automatically detect which function to call
/// @note
/// Basically there are two ways to add support for other data types
/// (an illustrative sketch follows this overload group):
/// (a) Use the corresponding convenience API from H5LTpublic.h if the type is supported (luckily)
/// (b) Use H5LTread_dataset(), but then you need to pass in the type_id, which are listed at:
/// http://www.hdfgroup.org/HDF5/doc/RM/PredefDTypes.html
/// Pay special attention to endianness.
///@{
herr_t readHDF5Dataset( hid_t loc_id, const char *dset_name, int *buffer )
{
return H5LTread_dataset_int( loc_id, dset_name, buffer );
}
herr_t readHDF5Dataset( hid_t loc_id, const char *dset_name, long *buffer )
{
return H5LTread_dataset_long( loc_id, dset_name, buffer );
}
herr_t readHDF5Dataset( hid_t loc_id, const char *dset_name, float *buffer )
{
return H5LTread_dataset_float( loc_id, dset_name, buffer );
}
herr_t readHDF5Dataset( hid_t loc_id, const char *dset_name, double *buffer )
{
return H5LTread_dataset_double( loc_id, dset_name, buffer );
}
///@}
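// The following overload is an illustrative sketch of approach (b) from the note above: it is not
// used by the loaders in this file and merely shows how a further element type (here unsigned int,
// assumed to be 4 bytes) could be wired in through the generic H5LTread_dataset() call with an
// explicit native-endian type id. To make loadIntoMatrix() actually accept such a type,
// isSupported() below would need a matching branch as well.
herr_t readHDF5Dataset( hid_t loc_id, const char *dset_name, unsigned int *buffer )
{
return H5LTread_dataset( loc_id, dset_name, H5T_NATIVE_UINT, buffer );
}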
/// Check whether typeClass and typeSize are supported by the current implementation
template<typename RawValueType>
bool isSupported(H5T_class_t typeClass, size_t typeSize)
{
if (H5T_FLOAT == typeClass && 8 == typeSize && std::is_floating_point<RawValueType>::value
&& sizeof(RawValueType) == 8) {
// double
return true;
} else if (H5T_FLOAT == typeClass && 4 == typeSize && std::is_floating_point<RawValueType>::value
&& sizeof(RawValueType) == 4) {
// float
return true;
} else if (H5T_INTEGER == typeClass && 4 == typeSize && std::is_integral<RawValueType>::value
&& sizeof(RawValueType) == 4) {
// int
return true;
} else if (H5T_INTEGER == typeClass && 8 == typeSize && std::is_integral<RawValueType>::value
&& sizeof(RawValueType) == 8) {
// long
return true;
}
return false;
}
/// @brief Load a dataset in an HDF5 file into a matrix
///
/// @param data
/// A vector-of-vectors container that receives the read-in data; it must support assignment operations
/// @param fileName
/// The name of the HDF5 file to be read from
/// @param dataSetName
/// the HDF5 dataset name to access in the HDF5 file
///
/// @tparam MatrixType
/// The type of the data container which will accept the read-in data; it should behave like a 2-dimensional matrix
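///
/// @note For a two-dimensional dataset with dims[0] = M and dims[1] = N, each of the N columns
/// becomes one vector of M elements (the HDF5 buffer is row-major), so @a data receives N vectors
/// of length M; a one-dimensional dataset yields a single vector.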
template<typename MatrixType>
void loadIntoMatrix(MatrixType& data, const std::string& fileName, const std::string& dataSetName)
{
typedef typename MatrixType::value_type VectorType; // e.g., std::vector<double>
typedef typename VectorType::value_type RawValueType; // e.g., double
// Disable HDF5 diagnostic messages; comment this line out when debugging HDF5-related issues
H5Eset_auto1(0, 0);
// 64 is big enough for HDF5, which supports no more than 32 dimensions presently
const size_t MAX_DIMENSIONS = 64u;
// Open the file, and then get dimension
hid_t open = H5Fopen(fileName.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
if(open < 0)
throw SHARKEXCEPTION((boost::format("[loadIntoMatrix] open file name: %1% (FAILED)") % fileName).str());
const ScopedHandle<hid_t> fileId(
open,
H5Fclose
);
boost::array<hsize_t, MAX_DIMENSIONS> dims;
dims.assign(0);
H5T_class_t typeClass;
size_t typeSize;
SHARK_RUNTIME_CHECK(
H5LTget_dataset_info(*fileId, dataSetName.c_str(), dims.c_array(), &typeClass, &typeSize) >= 0,
(boost::format("Get data set(%1%) info from file(%2%).") % dataSetName % fileName).str());
if (0 == dims[0])
return;
// Support 1 or 2 dimensions only at the moment
SHARK_RUNTIME_CHECK(
0 == dims[2],
(boost::format(
"[%1%][%2%] Support 1 or 2 dimensions, but this dataset has at least 3 dimensions.") % fileName % dataSetName).str());
const hsize_t dim0 = dims[0];
const hsize_t dim1 = (0 == dims[1]) ? 1 : dims[1]; // treat a one-dimensional dataset as a two-dimensional N x 1 matrix
SHARK_RUNTIME_CHECK(
detail::isSupported<RawValueType>(typeClass, typeSize),
(boost::format(
"DataType doesn't match. HDF5 data type in dataset(%3%::%4%): %1%, size: %2%")
% typeClass
% typeSize
% fileName
% dataSetName).str());
// Read data into a buffer
const boost::scoped_array<RawValueType> dataBuffer(new RawValueType[dim0 * dim1]);
SHARK_RUNTIME_CHECK(detail::readHDF5Dataset(*fileId, dataSetName.c_str(), dataBuffer.get()) >= 0, "Read data set.");
// dims[0] = M, dims[1] = N, means each basic vector has M elements, and there are N of them.
for (size_t i = 0; i < dim1; ++i) {
VectorType sample(dim0);
for (size_t j = 0; j < dim0; ++j)
sample[j] = dataBuffer[i + j * dim1]; // elements in memory are in row-major order
data.push_back(sample);
}
}
/// @brief Load a matrix from an HDF5 file stored in compressed sparse column (CSC) format
///
/// @param data the container which will hold the output matrix
/// @param fileName the name of the HDF5 file
/// @param cscDatasetName the three dataset names describing the CSC matrix (values, row indices, column index pointers)
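///
/// As a small worked example (hypothetical values): the dense 3 x 2 matrix
/// [[1, 0], [0, 2], [3, 0]] stored in CSC form corresponds to the three datasets
/// value = {1, 3, 2}, indices = {0, 2, 1} (row index of each stored value) and
/// indexPtr = {0, 2, 3} (offset of each column's first value, plus a final entry equal to the
/// number of stored values); the loader then produces the two column vectors {1, 0, 3} and {0, 2, 0}.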
template<typename MatrixType>
void loadHDF5Csc(MatrixType& data, const std::string& fileName, const std::vector<std::string>& cscDatasetName)
{
typedef typename MatrixType::value_type VectorType; // e.g., std::vector<double>
SHARK_RUNTIME_CHECK(
3 == cscDatasetName.size(),
"Must provide 3 dataset names for importing Compressed Sparse Column format.");
std::vector<VectorType> valBuf;
std::vector<std::vector<boost::int32_t> > indicesBuf;
std::vector<std::vector<boost::int32_t> > indexPtrBuf;
detail::loadIntoMatrix(valBuf, fileName, cscDatasetName[0]);
detail::loadIntoMatrix(indicesBuf, fileName, cscDatasetName[1]);
detail::loadIntoMatrix(indexPtrBuf, fileName, cscDatasetName[2]);
SHARK_RUNTIME_CHECK(1u == valBuf.size() && 1u == indicesBuf.size() && 1u == indexPtrBuf.size(), "All datasets should be of one dimension.");
const VectorType& val = valBuf.front();
const std::vector<boost::int32_t>& indices = indicesBuf.front(); // WARNING: Not all indices are of int32 type
const std::vector<boost::int32_t>& indexPtr = indexPtrBuf.front();
SHARK_RUNTIME_CHECK(val.size() == indices.size(), "Size of value and indices should be the same.");
SHARK_RUNTIME_CHECK(indexPtr.back() == (boost::int32_t)val.size(), "Last element of index pointer should equal the size of value.");
// Figure out dimensions of dense matrix
const boost::uint32_t columnCount = indexPtr.size() - 1; // the last one is a placeholder
const boost::uint32_t rowCount = *boost::max_element(indices) + 1; // max index plus 1
data.resize(columnCount);
boost::fill(data, VectorType(rowCount, 0)); // pre-fill zero
size_t valIdx = 0;
for (size_t i = 0; i < columnCount; ++i) {
for (boost::int32_t j = indexPtr[i]; j < indexPtr[i + 1]; ++j) {
data[i][indices[j]] = val[valIdx++];
}
}
}
/// @brief Construct labeled data from passed in data and label
///
/// @param labeledData
/// Container storing the loaded data
/// @param dataBuffer
/// The data read in from the HDF5 file
/// @param labelBuffer
/// The labels for the data inside @a dataBuffer
template<typename VectorType, typename LabelType>
void constructLabeledData(
LabeledData<VectorType, LabelType>& labeledData,
const std::vector<VectorType>& dataBuffer,
const std::vector<std::vector<LabelType> >& labelBuffer)
{
SHARK_RUNTIME_CHECK(
1 == labelBuffer.size(),
(boost::format("Expect only one label vector, but get %1%.") % labelBuffer.size()).str());
SHARK_RUNTIME_CHECK(
dataBuffer.size() == labelBuffer.front().size(),
boost::format("Dimensions of data and label don't match.").str());
labeledData = createLabeledDataFromRange(dataBuffer, labelBuffer.front());
}
} // namespace detail
/// @brief Import data from an HDF5 file.
///
/// @param data Container storing the loaded data
/// @param fileName The name of HDF5 file to be read from
/// @param datasetName the HDF5 dataset name to access in the HDF5 file
///
/// @tparam VectorType Type of object stored in Shark data container
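///
/// A minimal usage sketch (the file name "mydata.h5" and dataset name "samples" are hypothetical):
/// @code
/// Data<RealVector> data;
/// importHDF5<RealVector>(data, "mydata.h5", "samples");
/// @endcode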
template<typename VectorType>
void importHDF5(
Data<VectorType>& data,
const std::string& fileName,
const std::string& datasetName)
{
std::vector<VectorType> readinBuffer;
detail::loadIntoMatrix(readinBuffer, fileName, datasetName);
data = createDataFromRange(readinBuffer);
}
/// @brief Import data to a LabeledData object from an HDF5 file.
///
/// @param labeledData
/// Container storing the loaded data
/// @param fileName
/// The name of HDF5 file to be read from
/// @param data
/// the HDF5 dataset name for data
/// @param label
/// the HDF5 dataset name for label
///
/// @tparam VectorType
/// Type of object stored in Shark data container
/// @tparam LabelType
/// Type of label
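///
/// A minimal usage sketch (file and dataset names are hypothetical; the label dataset is assumed
/// to hold 4-byte integers so that the int overload of detail::readHDF5Dataset applies):
/// @code
/// LabeledData<RealVector, int> dataset;
/// importHDF5<RealVector, int>(dataset, "mydata.h5", "samples", "labels");
/// @endcode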
template<typename VectorType, typename LabelType>
void importHDF5(
LabeledData<VectorType, LabelType>& labeledData,
const std::string& fileName,
const std::string& data,
const std::string& label)
{
std::vector<VectorType> readinData;
std::vector<std::vector<LabelType> > readinLabel;
detail::loadIntoMatrix(readinData, fileName, data);
detail::loadIntoMatrix(readinLabel, fileName, label);
detail::constructLabeledData(labeledData, readinData, readinLabel);
}
/// @brief Import data from an HDF5 dataset in compressed sparse column format.
///
/// @param data Container storing the loaded data
/// @param fileName The name of HDF5 file to be read from
/// @param cscDatasetName
/// the CSC dataset names used to construct a matrix
///
/// @tparam VectorType Type of object stored in Shark data container
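///
/// A minimal usage sketch (the file name and the three CSC dataset names are hypothetical):
/// @code
/// Data<RealVector> data;
/// std::vector<std::string> csc;
/// csc.push_back("value"); csc.push_back("indices"); csc.push_back("indptr");
/// importHDF5<RealVector>(data, "mydata.h5", csc);
/// @endcode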
template<typename VectorType>
void importHDF5(
Data<VectorType>& data,
const std::string& fileName,
const std::vector<std::string>& cscDatasetName)
{
std::vector<VectorType> readinBuffer;
detail::loadHDF5Csc(readinBuffer, fileName, cscDatasetName);
data = createDataFromRange(readinBuffer);
}
/// @brief Import labeled data from an HDF5 dataset in compressed sparse column format.
///
/// @param labeledData
/// Container storing the loaded data
/// @param fileName
/// The name of HDF5 file to be read from
/// @param cscDatasetName
/// the CSC dataset names used to construct a matrix
/// @param label
/// the HDF5 dataset name for label
///
/// @tparam VectorType
/// Type of object stored in Shark data container
/// @tparam LabelType
/// Type of label
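///
/// A minimal usage sketch (file and dataset names are hypothetical; a 4-byte integer label
/// dataset is assumed):
/// @code
/// LabeledData<RealVector, int> dataset;
/// std::vector<std::string> csc;
/// csc.push_back("value"); csc.push_back("indices"); csc.push_back("indptr");
/// importHDF5<RealVector, int>(dataset, "mydata.h5", csc, "labels");
/// @endcode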
template<typename VectorType, typename LabelType>
void importHDF5(
LabeledData<VectorType, LabelType>& labeledData,
const std::string& fileName,
const std::vector<std::string>& cscDatasetName,
const std::string& label)
{
std::vector<VectorType> readinData;
std::vector<std::vector<LabelType> > readinLabel;
detail::loadHDF5Csc(readinData, fileName, cscDatasetName);
detail::loadIntoMatrix(readinLabel, fileName, label);
detail::constructLabeledData(labeledData, readinData, readinLabel);
}
} // namespace shark
#endif // SHARK_DATA_HDF5_H