//===========================================================================
/*!
*
*
* \brief concatenation of two models, with type erasure
*
*
*
* \author O. Krause
* \date 2010-2011
*
*
* \par Copyright 1995-2017 Shark Development Team
*
*
* This file is part of Shark.
*
*
* Shark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Shark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Shark. If not, see <http://www.gnu.org/licenses/>.
*
*/
//===========================================================================
#ifndef SHARK_MODEL_CONCATENATEDMODEL_H
#define SHARK_MODEL_CONCATENATEDMODEL_H
#include <shark/Models/AbstractModel.h>
#include <boost/shared_ptr.hpp>
#include <vector>
namespace shark {
///\brief ConcatenatedModel concatenates two models such that the output of the first model is the input of the second.
///
///Sometimes a series of models is needed to generate the desired output. For example, when input data needs to be
///normalized before it can be fed into the trained model. In this case, the ConcatenatedModel can be used to
///represent this series as one model.
///The easiest way to do this is using the operator >> of AbstractModel:
///ConcatenatedModel<VectorType> model = model1 >> model2;
///VectorType must be the input type of model1 as well as the output type of model2, and the output of model1
///must match the input of model2. Alternatively, sub-models can be appended one at a time using add():
///ConcatenatedModel<VectorType> model; model.add(&model1, true); model.add(&model2, true);
///Warning: model1 and model2 must outlive model. If they are destroyed first, the behavior is undefined.
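///
///A minimal usage sketch, assuming two already created models over the same vector type,
///for example a Normalizer<RealVector> normalizer and a LinearModel<RealVector> network:
///ConcatenatedModel<RealVector> model = normalizer >> network;
///model.enableModelOptimization(0, false); //keep the normalizer fixed during training
///RealVector params = model.parameterVector(); //contains only the parameters of network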
template<class VectorType>
class ConcatenatedModel: public AbstractModel<VectorType, VectorType, VectorType> {
private:
typedef AbstractModel<VectorType, VectorType, VectorType> base_type;
public:
typedef typename base_type::BatchInputType BatchInputType;
typedef typename base_type::BatchOutputType BatchOutputType;
typedef typename base_type::ParameterVectorType ParameterVectorType;
/// \brief From INameable: return the class name.
std::string name() const
{ return "ConcatenatedModel"; }
///\brief Returns the expected shape of the input
Shape inputShape() const{
return m_layers.front().model->inputShape();
}
///\brief Returns the shape of the output
Shape outputShape() const{
return m_layers.back().model->outputShape();
}
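///\brief Adds another model to the end of the sequence; the model must outlive this object.
///
///If optimize is true, the parameters of the added model are exposed through parameterVector().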
void add(AbstractModel<VectorType, VectorType, VectorType>* layer, bool optimize){
m_layers.push_back({layer,optimize});
enableModelOptimization(m_layers.size()-1, optimize);//recompute capabilities
}
///\brief Sets whether the parameters of the index-th model should be optimized.
///
/// If the model contains non-differentiable sub-models, disabling optimization of those
/// sub-models makes the remaining model differentiable.
/// Note that the models are ordered as model0 >> model1 >> model2 >> ...
void enableModelOptimization(std::size_t index, bool opt){
SIZE_CHECK(index < m_layers.size());
m_layers[index].optimize = opt;
this->m_features.reset();
bool inputDerivative = true;
bool parameterDerivative = true;
for(std::size_t k = 0; k != m_layers.size(); ++k){
auto const& layer = m_layers[m_layers.size() - k -1];//we iterate backwards through the layers
if( layer.optimize && (!layer.model->hasFirstParameterDerivative() || !inputDerivative)){
parameterDerivative = false;
}
if( !layer.model->hasFirstInputDerivative()){
inputDerivative = false;
}
}
if (parameterDerivative){
this->m_features |= base_type::HAS_FIRST_PARAMETER_DERIVATIVE;
}
if (inputDerivative){
this->m_features |= base_type::HAS_FIRST_INPUT_DERIVATIVE;
}
}
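///\brief Returns the concatenated parameter vectors of all sub-models marked for optimization.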
ParameterVectorType parameterVector() const {
ParameterVectorType params(numberOfParameters());
std::size_t pos = 0;
for(auto layer: m_layers){
if(!layer.optimize) continue;
ParameterVectorType layerParams = layer.model->parameterVector();
noalias(subrange(params,pos,pos+layerParams.size())) = layerParams;
pos += layerParams.size();
}
return params;
}
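///\brief Splits the parameter vector and distributes the pieces to the sub-models marked for optimization.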
void setParameterVector(ParameterVectorType const& newParameters) {
std::size_t pos = 0;
for(auto layer: m_layers){
if(!layer.optimize) continue;
ParameterVectorType layerParams = subrange(newParameters,pos,pos+layer.model->numberOfParameters());
layer.model->setParameterVector(layerParams);
pos += layerParams.size();
}
}
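///\brief Returns the total number of parameters of all sub-models marked for optimization.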
std::size_t numberOfParameters() const{
std::size_t numParams = 0;
for(auto layer: m_layers){
if(!layer.optimize) continue;
numParams += layer.model->numberOfParameters();
}
return numParams;
}
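///\brief Creates the internal state, holding one state object and one intermediate output batch per sub-model.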
boost::shared_ptr<State> createState()const{
InternalState* state = new InternalState;
for(std::size_t i = 0; i != m_layers.size(); ++i){
state->state.push_back(m_layers[i].model->createState());
state->intermediates.push_back(BatchOutputType());
}
return boost::shared_ptr<State>(state);
}
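///\brief Returns the output batch of the index-th sub-model as stored in the state by the last call to eval().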
BatchOutputType const& hiddenResponses(State const& state, std::size_t index)const{
InternalState const& s = state.toState<InternalState>();
return s.intermediates[index];
}
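///\brief Returns the state of the index-th sub-model as stored by the last call to eval().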
State const& hiddenState(State const& state, std::size_t index)const{
InternalState const& s = state.toState<InternalState>();
return *s.state[index];
}
using base_type::eval;
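///\brief Propagates a batch of patterns through the sequence of models.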
void eval(BatchInputType const& patterns, BatchOutputType& outputs)const {
BatchOutputType intermediates;
outputs = patterns;
for(auto layer: m_layers){
swap(intermediates,outputs);
layer.model->eval(intermediates,outputs);
}
}
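///\brief Propagates a batch of patterns through the sequence, storing all intermediate outputs and sub-model states for later derivative computations.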
void eval(BatchInputType const& patterns, BatchOutputType& outputs, State& state)const{
InternalState& s = state.toState<InternalState>();
outputs = patterns;
for(std::size_t i = 0; i != m_layers.size(); ++i){
if(i == 0)
m_layers[i].model->eval(patterns,s.intermediates[i], *s.state[i]);
else
m_layers[i].model->eval(s.intermediates[i-1],s.intermediates[i], *s.state[i]);
}
outputs = s.intermediates.back();
}
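///\brief Back-propagates the coefficients through the sequence and accumulates the weighted parameter derivative of every sub-model marked for optimization.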
void weightedParameterDerivative(
BatchInputType const& patterns,
BatchOutputType const & outputs,
BatchOutputType const& coefficients,
State const& state,
ParameterVectorType& gradient
)const{
InternalState const& s = state.toState<InternalState>();
BatchOutputType inputDerivativeLast;
BatchOutputType inputDerivative = coefficients;
gradient.resize(numberOfParameters());
std::size_t paramEnd = gradient.size();
for(std::size_t k = 0; k != m_layers.size(); ++k){
std::size_t i = m_layers.size() - k -1;//we iterate backwards through the layers
BatchInputType const* pInput = &patterns;
if(i != 0)
pInput = &s.intermediates[i-1];
swap(inputDerivativeLast,inputDerivative);
//if the current layer does not need to be optimized, we just check whether we have to compute the chain rule
if(!m_layers[i].optimize || m_layers[i].model->numberOfParameters() == 0){
if(i != 0) //check, if we are done, the input layer does not need to compute anything
m_layers[i].model->weightedInputDerivative(*pInput,s.intermediates[i], inputDerivativeLast, *s.state[i], inputDerivative);
}else{
ParameterVectorType paramDerivative;
if(i != 0){//if we are in an intermediates layer, compute chain rule
m_layers[i].model->weightedDerivatives(*pInput,s.intermediates[i], inputDerivativeLast, *s.state[i], paramDerivative,inputDerivative);
}
else{//lowest layer only needs to compute parameter derivative
m_layers[i].model->weightedParameterDerivative(*pInput,s.intermediates[i], inputDerivativeLast, *s.state[i], paramDerivative);
}
noalias(subrange(gradient,paramEnd - paramDerivative.size(),paramEnd)) = paramDerivative;
paramEnd -= paramDerivative.size();
}
}
}
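///\brief Back-propagates the coefficients through all sub-models to obtain the weighted derivative with respect to the inputs.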
void weightedInputDerivative(
BatchInputType const& patterns,
BatchOutputType const & outputs,
BatchOutputType const& coefficients,
State const& state,
BatchOutputType& derivatives
)const{
InternalState const& s = state.toState<InternalState>();
BatchOutputType derivativeLast;
derivatives = coefficients;
for(std::size_t k = 0; k != m_layers.size(); ++k){
std::size_t i = m_layers.size() - k -1;//we iterate backwards through the layers
BatchInputType const* pInput = &patterns;
if(i != 0)
pInput = &s.intermediates[i-1];
swap(derivativeLast,derivatives);
m_layers[i].model->weightedInputDerivative(*pInput,s.intermediates[i], derivativeLast, *s.state[i], derivatives);
}
}
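///\brief Computes the weighted parameter and input derivatives in a single backward pass through the sequence.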
virtual void weightedDerivatives(
BatchInputType const & patterns,
BatchOutputType const & outputs,
BatchOutputType const & coefficients,
State const& state,
ParameterVectorType& gradient,
BatchInputType& inputDerivative
)const{
InternalState const& s = state.toState<InternalState>();
BatchOutputType inputDerivativeLast;
inputDerivative = coefficients;
gradient.resize(numberOfParameters());
std::size_t paramEnd = gradient.size();
for(std::size_t k = 0; k != m_layers.size(); ++k){
std::size_t i = m_layers.size() - k -1;//we iterate backwards through the layers
BatchInputType const* pInput = &patterns;
if(i != 0)
pInput = &s.intermediates[i-1];
swap(inputDerivativeLast,inputDerivative);
//if the current layer does not need to be optimized, we just check whether we have to compute the chain rule
if(!m_layers[i].optimize || m_layers[i].model->numberOfParameters() == 0){
m_layers[i].model->weightedInputDerivative(*pInput,s.intermediates[i], inputDerivativeLast, *s.state[i], inputDerivative);
}else{
ParameterVectorType paramDerivative;
m_layers[i].model->weightedDerivatives(*pInput,s.intermediates[i], inputDerivativeLast, *s.state[i], paramDerivative,inputDerivative);
noalias(subrange(gradient,paramEnd - paramDerivative.size(),paramEnd)) = paramDerivative;
paramEnd -= paramDerivative.size();
}
}
}
/// From ISerializable
void read( InArchive & archive ){
for(auto& layer: m_layers){
archive >> *layer.model;
archive >> layer.optimize;
}
}
/// From ISerializable
void write( OutArchive & archive ) const{
for(auto layer: m_layers){
archive << *layer.model;
archive << layer.optimize;
}
}
private:
struct Layer{
AbstractModel<VectorType, VectorType, VectorType>* model;
bool optimize;
};
std::vector<Layer> m_layers;
struct InternalState: State{
std::vector<boost::shared_ptr<State> > state;
std::vector<BatchOutputType> intermediates;
};
};
///\brief Connects two AbstractModels so that the output of the first model is the input of the second.
template<class VectorType>
ConcatenatedModel<VectorType> operator>>(
AbstractModel<VectorType, VectorType, VectorType>& firstModel,
AbstractModel<VectorType, VectorType, VectorType>& secondModel
){
ConcatenatedModel<VectorType> sequence;
sequence.add(&firstModel, true);
sequence.add(&secondModel, true);
return sequence;
}
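///\brief Appends another model to an existing sequence and returns the extended ConcatenatedModel.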
template<class VectorType>
ConcatenatedModel<VectorType> operator>>(
ConcatenatedModel<VectorType> const& firstModel,
AbstractModel<VectorType, VectorType, VectorType>& secondModel
){
ConcatenatedModel<VectorType> sequence = firstModel;
sequence.add(&secondModel, true);
return sequence;
}
}
#endif