From fc2423d27595bcd2d4aa3846212fcf7dba732f48 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 14 Mar 2019 20:58:26 -0400 Subject: [PATCH 1/4] [src] starting to refactor matrix/vector; drafting tensor stuff --- egs/mini_librispeech/s5/cmd.sh | 6 +- src/matrix/kaldi-matrix.cc | 64 ++++++----- src/matrix/kaldi-matrix.h | 47 ++++++--- src/matrix/kaldi-vector.cc | 14 +-- src/matrix/kaldi-vector.h | 63 ++++++----- src/tensor/tensor.h | 187 +++++++++++++++++++++++++++++++++ 6 files changed, 298 insertions(+), 83 deletions(-) create mode 100644 src/tensor/tensor.h diff --git a/egs/mini_librispeech/s5/cmd.sh b/egs/mini_librispeech/s5/cmd.sh index 71dd849a93b..223eb21c55d 100644 --- a/egs/mini_librispeech/s5/cmd.sh +++ b/egs/mini_librispeech/s5/cmd.sh @@ -10,6 +10,6 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -export train_cmd="queue.pl --mem 2G" -export decode_cmd="queue.pl --mem 4G" -export mkgraph_cmd="queue.pl --mem 8G" +export train_cmd="retry.pl --num-tries 3 queue.pl --mem 2G" +export decode_cmd="retry.pl --num-tries 3 queue.pl --mem 4G" +export mkgraph_cmd="retry.pl --num-tries 3 queue.pl --mem 8G" diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc index fcfe0616b64..48481f5f19e 100644 --- a/src/matrix/kaldi-matrix.cc +++ b/src/matrix/kaldi-matrix.cc @@ -28,7 +28,7 @@ #include "matrix/compressed-matrix.h" #include "matrix/sparse-matrix.h" -static_assert(int(kaldi::kNoTrans) == int(CblasNoTrans) && int(kaldi::kTrans) == int(CblasTrans), +static_assert(int(kaldi::kNoTrans) == int(CblasNoTrans) && int(kaldi::kTrans) == int(CblasTrans), "kaldi::kNoTrans and kaldi::kTrans must be equal to the appropriate CBLAS library constants!"); namespace kaldi { @@ -538,7 +538,7 @@ void MatrixBase::AddMatSmat(Real alpha, const MatrixBase &A, // pass stride to write a column as matrices are stored in row major order. cblas_Xaxpy(this_num_rows, alpha_B_jk, a_col_k, A.stride_, this_col_j, this->stride_); - //for (MatrixIndexT i = 0; i < this_num_rows; ++i) + //for (MatrixIndexT i = 0; i < this_num_rows; ++i) // this_col_j[i*this->stride_] += alpha_B_jk * a_col_k[i*A.stride_]; } } @@ -786,12 +786,13 @@ inline void Matrix::Init(const MatrixIndexT rows, KALDI_ASSERT(rows == 0 && cols == 0); this->num_rows_ = 0; this->num_cols_ = 0; - this->stride_ = 0; + this->row_stride_ = 0; + this->col_stride_ = 0; this->data_ = NULL; return; } KALDI_ASSERT(rows > 0 && cols > 0); - MatrixIndexT skip, stride; + MatrixIndexT skip, row_stride; size_t size; void *data; // aligned memory block void *temp; // memory block to be really freed @@ -799,8 +800,8 @@ inline void Matrix::Init(const MatrixIndexT rows, // compute the size of skip and real cols skip = ((16 / sizeof(Real)) - cols % (16 / sizeof(Real))) % (16 / sizeof(Real)); - stride = cols + skip; - size = static_cast(rows) * static_cast(stride) + row_stride = cols + skip; + size = static_cast(rows) * static_cast(row_stride) * sizeof(Real); // allocate the memory and set the right dimensions and parameters @@ -808,7 +809,8 @@ inline void Matrix::Init(const MatrixIndexT rows, MatrixBase::data_ = static_cast (data); MatrixBase::num_rows_ = rows; MatrixBase::num_cols_ = cols; - MatrixBase::stride_ = (stride_type == kDefaultStride ? stride : cols); + MatrixBase::row_stride_ = (stride_type == kDefaultStride ? 
stride : cols); + MatrixBase::col_stride_ = 1; } else { throw std::bad_alloc(); } @@ -824,7 +826,7 @@ void Matrix::Resize(const MatrixIndexT rows, if (resize_type == kCopyData) { if (this->data_ == NULL || rows == 0) resize_type = kSetZero; // nothing to copy. else if (rows == this->num_rows_ && cols == this->num_cols_ && - (stride_type == kDefaultStride || this->stride_ == this->num_cols_)) { return; } // nothing to do. + (stride_type == kDefaultStride || this->row_stride_ == this->num_cols_)) { return; } // nothing to do. else { // set tmp to a matrix of the desired size; if new matrix // is bigger in some dimension, zero it. @@ -874,12 +876,14 @@ void MatrixBase::CopyFromMat(const MatrixBase &M, (*this).Row(i).CopyFromVec(M.Row(i)); } else { KALDI_ASSERT(num_cols_ == M.NumRows() && num_rows_ == M.NumCols()); - int32 this_stride = stride_, other_stride = M.Stride(); + int32 this_row_stride = row_stride_, this_col_stride = col_stride_, + other_row_stride = M.RowStride(), other_col_stride = M.ColStride(); Real *this_data = data_; const OtherReal *other_data = M.Data(); for (MatrixIndexT i = 0; i < num_rows_; i++) for (MatrixIndexT j = 0; j < num_cols_; j++) - this_data[i * this_stride + j] = other_data[j * other_stride + i]; + this_data[i * this_row_stride + j * this_col_stride] = + other_data[j * other_row_stride + i * other_col_stride]; } } @@ -902,15 +906,17 @@ template<> template<> void MatrixBase::CopyFromSp(const SpMatrix & M) { KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); - MatrixIndexT num_rows = num_rows_, stride = stride_; + MatrixIndexT num_rows = num_rows_, + row_stride = row_stride_, + col_stride = col_stride_; const float *Mdata = M.Data(); float *row_data = data_, *col_data = data_; for (MatrixIndexT i = 0; i < num_rows; i++) { - cblas_scopy(i+1, Mdata, 1, row_data, 1); // copy to the row. - cblas_scopy(i, Mdata, 1, col_data, stride); // copy to the column. - Mdata += i+1; - row_data += stride; - col_data += 1; + cblas_scopy(i + 1, Mdata, 1, row_data, col_stride); // copy to the row. + cblas_scopy(i, Mdata, 1, col_data, row_stride); // copy to the column. + Mdata += i + 1; + row_data += row_stride; + col_data += col_stride; } } @@ -919,15 +925,17 @@ template<> template<> void MatrixBase::CopyFromSp(const SpMatrix & M) { KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); - MatrixIndexT num_rows = num_rows_, stride = stride_; + MatrixIndexT num_rows = num_rows_, + row_stride = row_stride_, + col_stride = col_stride_; const double *Mdata = M.Data(); double *row_data = data_, *col_data = data_; for (MatrixIndexT i = 0; i < num_rows; i++) { - cblas_dcopy(i+1, Mdata, 1, row_data, 1); // copy to the row. - cblas_dcopy(i, Mdata, 1, col_data, stride); // copy to the column. + cblas_dcopy(i+1, Mdata, 1, row_data, col_stride); // copy to the row. + cblas_dcopy(i, Mdata, 1, col_data, row_stride); // copy to the column. 
Mdata += i+1; - row_data += stride; - col_data += 1; + row_data += row_stride; + col_data += col_stride; } } @@ -956,24 +964,26 @@ template template void MatrixBase::CopyFromTp(const TpMatrix & M, MatrixTransposeType Trans) { + MatrixIndexT row_stride = row_stride_, col_stride = col_stride_; if (Trans == kNoTrans) { KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); SetZero(); Real *out_i = data_; const OtherReal *in_i = M.Data(); - for (MatrixIndexT i = 0; i < num_rows_; i++, out_i += stride_, in_i += i) { + for (MatrixIndexT i = 0; i < num_rows_; + i++, out_i += row_stride_, in_i += i) { for (MatrixIndexT j = 0; j <= i; j++) - out_i[j] = in_i[j]; + out_i[j * col_stride] = in_i[j]; } } else { SetZero(); KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); - MatrixIndexT stride = stride_; Real *out_i = data_; const OtherReal *in_i = M.Data(); - for (MatrixIndexT i = 0; i < num_rows_; i++, out_i ++, in_i += i) { + for (MatrixIndexT i = 0; i < num_rows_; + i++, out_i += col_stride, in_i += i) { for (MatrixIndexT j = 0; j <= i; j++) - out_i[j*stride] = in_i[j]; + out_i[j * row_stride] = in_i[j]; } } } @@ -994,7 +1004,7 @@ void MatrixBase::CopyFromTp(const TpMatrix & M, template void MatrixBase::CopyRowsFromVec(const VectorBase &rv) { - if (rv.Dim() == num_rows_*num_cols_) { + if (rv.Dim() == num_rows_ * num_cols_) { if (stride_ == num_cols_) { // one big copy operation. const Real *rv_data = rv.Data(); diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h index 11a5e08b15d..9148f373c82 100644 --- a/src/matrix/kaldi-matrix.h +++ b/src/matrix/kaldi-matrix.h @@ -65,14 +65,19 @@ class MatrixBase { /// Returns number of columns (or zero for emtpy matrix). inline MatrixIndexT NumCols() const { return num_cols_; } - /// Stride (distance in memory between each row). Will be >= NumCols. - inline MatrixIndexT Stride() const { return stride_; } + /// Stride() is deprecated + inline MatrixIndexT Stride() const { return row_stride_; } + + /// The distance in memory between successive rows. Not required to be + /// positive or even nonzero, as long you can't get to the same + /// memory location using different indexes. + inline MatrixIndexT RowStride() const { return row_stride_; } + + /// The distance in memory between successive columns; will normally + /// be 1 but it may be negative or even zero as long as you + /// can't get to the same memory location using differen indexes. + inline MatrixIndexT ColStride() const { return col_stride_; } - /// Returns size in bytes of the data held by the matrix. - size_t SizeInBytes() const { - return static_cast(num_rows_) * static_cast(stride_) * - sizeof(Real); - } /// Gives pointer to raw data (const). inline const Real* Data() const { @@ -82,18 +87,19 @@ class MatrixBase { /// Gives pointer to raw data (non-const). inline Real* Data() { return data_; } - /// Returns pointer to data for one row (non-const) + /// Returns pointer to data for one row (non-const). + /// Caution: don't assume ColumnStride() is 1. 
inline Real* RowData(MatrixIndexT i) { KALDI_ASSERT(static_cast(i) < static_cast(num_rows_)); - return data_ + i * stride_; + return data_ + i * row_stride_; } /// Returns pointer to data for one row (const) inline const Real* RowData(MatrixIndexT i) const { KALDI_ASSERT(static_cast(i) < static_cast(num_rows_)); - return data_ + i * stride_; + return data_ + i * row_stride_; } /// Indexing operator, non-const @@ -103,7 +109,7 @@ class MatrixBase { static_cast(num_rows_) && static_cast(c) < static_cast(num_cols_)); - return *(data_ + r * stride_ + c); + return *(data_ + r * row_stride_ + c * col_stride_); } /// Indexing operator, provided for ease of debugging (gdb doesn't work /// with parenthesis operator). @@ -116,7 +122,7 @@ class MatrixBase { static_cast(num_rows_) && static_cast(c) < static_cast(num_cols_)); - return *(data_ + r * stride_ + c); + return *(data_ + r * row_stride_ + c * col_stride_); } /* Basic setting-to-special values functions. */ @@ -763,13 +769,20 @@ class MatrixBase { /// data memory area Real* data_; - /// these atributes store the real matrix size as it is stored in memory - /// including memalignment MatrixIndexT num_cols_; /// < Number of columns MatrixIndexT num_rows_; /// < Number of rows - /** True number of columns for the internal matrix. This number may differ - * from num_cols_ as memory alignment might be used. */ - MatrixIndexT stride_; + MatrixIndexT row_stride_; ///< Row stride (distance in memory between one + ///< row and the next). Expected to + ///< satisfy abs(row_stride_) >= abs(col_stride_) + ///< (although this won't lead to wrong operation + ///< so we don't check this); + ///< and the matrix must have the property + ///< that no element can be accessed via + ///< two different pairs of indexes. + MatrixIndexT col_stride_; ///< Column stride (distance in memory between + ///< one column and the next). Normally + ///< expected to equal 1. + private: KALDI_DISALLOW_COPY_AND_ASSIGN(MatrixBase); }; diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc index c8ea35112ea..4756abda94e 100644 --- a/src/matrix/kaldi-vector.cc +++ b/src/matrix/kaldi-vector.cc @@ -152,19 +152,20 @@ template void VectorBase::MulTp(const TpMatrix &M, const MatrixTransposeType trans) { KALDI_ASSERT(M.NumRows() == dim_); - cblas_Xtpmv(trans,M.Data(),M.NumRows(),data_,1); + cblas_Xtpmv(trans, M.Data(), M.NumRows(), data_, stride_); } template void VectorBase::Solve(const TpMatrix &M, const MatrixTransposeType trans) { KALDI_ASSERT(M.NumRows() == dim_); - cblas_Xtpsv(trans, M.Data(), M.NumRows(), data_, 1); + cblas_Xtpsv(trans, M.Data(), M.NumRows(), data_, stride_); } template inline void Vector::Init(const MatrixIndexT dim) { + stride_ = 1; KALDI_ASSERT(dim >= 0); if (dim == 0) { this->dim_ = 0; @@ -188,7 +189,6 @@ inline void Vector::Init(const MatrixIndexT dim) { template void Vector::Resize(const MatrixIndexT dim, MatrixResizeType resize_type) { - // the next block uses recursion to handle what we have to do if // resize_type == kCopyData. 
 if (resize_type == kCopyData) {
@@ -244,12 +244,6 @@ template void VectorBase::CopyFromPacked(const PackedMatrix &other
 template void VectorBase::CopyFromPacked(const PackedMatrix &other);
 template void VectorBase::CopyFromPacked(const PackedMatrix &other);
 
-/// Load data into the vector
-template
-void VectorBase::CopyFromPtr(const Real *data, MatrixIndexT sz) {
-  KALDI_ASSERT(dim_ == sz);
-  std::memcpy(this->data_, data, Dim() * sizeof(Real));
-}
 
 template
 template
 void VectorBase::CopyFromVec(const VectorBase &other) {
@@ -264,7 +258,7 @@ void VectorBase::CopyFromVec(const VectorBase &other) {
 template void VectorBase::CopyFromVec(const VectorBase &other);
 template void VectorBase::CopyFromVec(const VectorBase &other);
 
-// Remove element from the vector. The vector is non reallocated
+// Remove element from the vector. The vector is not reallocated
 template
 void Vector::RemoveElement(MatrixIndexT i) {
   KALDI_ASSERT(i < this->dim_ && "Access out of vector");
diff --git a/src/matrix/kaldi-vector.h b/src/matrix/kaldi-vector.h
index 383d8ca2862..2a50ae2f2ce 100644
--- a/src/matrix/kaldi-vector.h
+++ b/src/matrix/kaldi-vector.h
@@ -62,8 +62,19 @@ class VectorBase {
   /// Returns the dimension of the vector.
   inline MatrixIndexT Dim() const { return dim_; }
 
-  /// Returns the size in memory of the vector, in bytes.
-  inline MatrixIndexT SizeInBytes() const { return (dim_*sizeof(Real)); }
+  /// Returns the stride between elements of the vector; will normally be 1,
+  /// and must be nonzero.  CAUTION: we are in the process of updating this
+  /// library to support vector strides, so stride != 1 may not be supported
+  /// everywhere, and may sometimes lead to unexpected behavior or crashes.
+  inline MatrixIndexT Stride() const { return stride_; }
+
+  /// Returns the size in memory of the vector, in bytes, assuming stride is 1
+  /// (if not, this doesn't make sense in the contexts in which this is
+  /// called).  TODO: get rid of this.
+  inline MatrixIndexT SizeInBytes() const {
+    KALDI_ASSERT(stride_ == 1);
+    return (dim_*sizeof(Real));
+  }
 
   /// Returns a pointer to the start of the vector's data.
   inline Real* Data() { return data_; }
@@ -75,14 +86,14 @@
   inline Real operator() (MatrixIndexT i) const {
     KALDI_PARANOID_ASSERT(static_cast(i) <
                           static_cast(dim_));
-    return *(data_ + i);
+    return *(data_ + i * stride_);
   }
 
   /// Indexing operator (non-const).
   inline Real & operator() (MatrixIndexT i) {
     KALDI_PARANOID_ASSERT(static_cast(i) <
                           static_cast(dim_));
-    return *(data_ + i);
+    return *(data_ + i * stride_);
   }
 
   /** @brief Returns a sub-vector of a vector (a range of elements).
@@ -360,25 +371,18 @@ class VectorBase {
   ~VectorBase() {}
 
   /// Empty initializer, corresponds to vector of zero size.
-  explicit VectorBase(): data_(NULL), dim_(0) {
+  explicit VectorBase(): data_(NULL), dim_(0), stride_(1) {
     KALDI_ASSERT_IS_FLOATING_TYPE(Real);
   }
 
-// Took this out since it is not currently used, and it is possible to create
-// objects where the allocated memory is not the same size as dim_ : Arnab
-// /// Initializer from a pointer and a size; keeps the pointer internally
-// /// (ownership or non-ownership depends on the child class).
-// explicit VectorBase(Real* data, MatrixIndexT dim)
-//  : data_(data), dim_(dim) {}
-
-  // Arnab : made this protected since it is unsafe too.
-  /// Load data into the vector: sz must match own size.
-  void CopyFromPtr(const Real* Data, MatrixIndexT sz);
 
   /// data memory area
   Real* data_;
 
   /// dimension of vector
   MatrixIndexT dim_;
+  /// stride between elements of the vector.  Would normally be 1.  Must be
+  /// > 0 (if the vector is nonempty).
+  MatrixIndexT stride_;
 
   KALDI_DISALLOW_COPY_AND_ASSIGN(VectorBase);
 }; // class VectorBase
 
@@ -484,17 +488,24 @@ class Vector: public VectorBase {
 template
 class SubVector : public VectorBase {
  public:
-  /// Constructor from a Vector or SubVector.
-  /// SubVectors are not const-safe and it's very hard to make them
-  /// so for now we just give up.  This function contains const_cast.
-  SubVector(const VectorBase &t, const MatrixIndexT origin,
-            const MatrixIndexT length) : VectorBase() {
-    // following assert equiv to origin>=0 && length>=0 &&
-    // origin+length <= rt.dim_
-    KALDI_ASSERT(static_cast(origin)+
-                 static_cast(length) <=
-                 static_cast(t.Dim()));
-    VectorBase::data_ = const_cast (t.Data()+origin);
+  /**
+     Constructor from a Vector or SubVector.
+     SubVectors are not const-safe and it's very hard to make them
+     so for now we just give up.  This function contains const_cast.
+       @param [in] src           The vector we are taking a sub-vector of
+       @param [in] begin         The first element in 'src'
+       @param [in] num_elements  The number of elements we are taking
+       @param [in] step          The step between elements from 'src'; must be >0.
+  */
+  SubVector(const VectorBase &src,
+            const MatrixIndexT begin,
+            const MatrixIndexT num_elements,
+            const MatrixIndexT step = 1) : VectorBase() {
+    KALDI_ASSERT(stride > 0 && static_cast(src)+
+                 static_cast((num_elements - 1) * step)
+                 < static_cast(t.Dim()));
+    VectorBase::data_ = const_cast (t.Data()+src);
     VectorBase::dim_ = length;
   }
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
new file mode 100644
index 00000000000..e94b6978a6b
--- /dev/null
+++ b/src/tensor/tensor.h
@@ -0,0 +1,187 @@
+/**
+   These are some notes on plans for kaldi10 tensor stuff; nothing is fully
+   fleshed out.
+*/
+
+namespace kaldi {
+namespace tensor {
+
+
+enum DeviceType {
+  kCpuDevice = 0,
+  kCudaDevice = 1
+};
+
+// We may later add a device number (like which GPU we are using),
+// once we support multiple GPUs.
+struct Device {
+  DeviceType device_type;
+  // operator ==, probably, maybe constructors.
+};
+
+
+// 'Storage' contains a single allocated region (on CPU or GPU, according
+// to 'device').
+struct Storage {
+  void *data;
+  size_t num_bytes;
+  Device device;
+
+  // Note: will throw if allocation fails (for now).
+  Storage(Device device, size_t num_bytes);
+
+  // Destructor deallocates 'data'.  For now there is no concept of a custom
+  // allocator or an allocator object; we just use our CuDevice stuff for CUDA
+  // allocation and posix_memalign for CPU allocation (obviously we need to
+  // make sure 'data' is aligned in the most specific way we might need); in
+  // future we might choose to add that.
+  ~Storage();
+};
+
+
+enum DataType {
+  kFloatDtype = 0,
+  kDoubleDtype = 1
+};
+
+#define KALDI_TENSOR_MAX_DIM 5
+
+
+
+/*
+  This struct stores the dimensions and strides of a Tensor.  The following
+  describes the properties that a Tensor will always have (note: we also use
+  TensorDim inside implementation code in ways such that these properties do
+  not all hold).
+
+  These properties are stricter than some other frameworks, such as PyTorch,
+  which allow the users to manually add dimensions with stride 0 (and dim>1)
+  so that a lower-dimensional quantity can masquerade as one with a higher
+  dimension.  We require that it never be possible to access the same memory
+  location using two different tuples of indexes.  We also don't allow zero
+  dims (i.e. a tensor must not be empty); if you want an empty Tensor, just
+  use a null pointer.
+
+     0 <= num_axes <= 5
+     for 0 <= axis < num_axes:
+        dims[axis] > 0
+
+  The strides may take any value, including zero or negative, as long as the
+  uniqueness property is satisfied (i.e. it must not be possible to access
+  the same memory location using two different tuples of indices).
+
+*/
+
+struct TensorDim {
+
+  int64_t num_axes;
+  int64_t dims[KALDI_TENSOR_MAX_DIM];
+  int64_t strides[KALDI_TENSOR_MAX_DIM];
+  // We may later add methods to this.
+
+  // Checks that the TensorDim is valid, assuming it is part of a Tensor.
+  // I.e. that it satisfies the properties mentioned above.
+  bool Check();
+};
+
+struct TensorDimProperties {
+  // Below are cached properties that depend on a TensorDim.
+
+  // The number of elements in the Tensor, which equals the product
+  // of dims[0] .. dims[num_axes - 1].  Must always be >0.
+  int64_t num_elements;
+
+  // is_contiguous means that the data form a contiguous block in memory; it
+  // is not the same as PyTorch's is_contiguous, which is a stronger
+  // condition; our has_expected_strides is equivalent to that.
+  bool is_contiguous;
+
+  // has_expected_strides means that the strides are as if this was a
+  // "c"-style multidimensional array, meaning that (using Python wrap-around
+  // indexing conventions as if strides were an array of dimension
+  // 'num_axes'), strides[-1] == 1, strides[-2] == dims[-1],
+  // strides[-3] == dims[-2] * dims[-1], and so on.  This is the same as
+  // PyTorch's is_contiguous.
+  bool has_expected_strides;
+
+  // (One possible implementation is sketched at the end of this patch
+  // series.)
+  void UpdateProperties(const TensorDim &dim);
+};
+
+
+
+class Tensor {
+ public:
+  // ...
+
+ private:
+  // The tensor dim and strides.
+  TensorDim dim_;
+  // Cached properties that depend on dim_.
+  TensorDimProperties derived_;
+  // The data-type of this tensor.
+  DataType dtype_;
+
+  // The raw data pointer
+  void *data_;
+
+  // The storage region where the data resides.  data_ does not necessarily
+  // equal storage_->data; it may be more than that, e.g. if this is a view
+  // to part of another Tensor.
+  std::shared_ptr<Storage> storage_;
+
+
+};
+
+/*
+  This is the 'gradient information' that class Variable stores for a Tensor
+  when it is initialized with requires_grad = true (or is a result of
+  an operation on Variables one of which had requires_grad = true).
+  This does not give you access to the underlying Variables; doing it
+  like this makes reference counting easier (no loops).  The GradFunc
+  will store any pointers to the original Variable that it may have
+  needed.
+
+  Users will rarely need to interact with this struct directly.
+ */
+struct TensorGrad {
+  // The gradients corresponding to the input variables, which
+  // we may need to update.  Some subset of these may be nullptr,
+  // corresponding to input Variables for which no gradient
+  // was required.
+  std::vector<std::shared_ptr<TensorGrad> > inputs;
+
+  // is_view is true if this is the gradient for a Tensor that is a view
+  // into another Tensor.
+  bool is_view{false};
+
+  // The device on which the gradient is to be located.
+  Device device;
+
+  // The dimension of the Tensor for which this is the gradient.  Used
+  // to set up 'grad' when needed.
+  TensorDim dim;
+
+  // 'offset' is only inspected if this is a view; it is the offset
+  // (in elements) from the start of the source Variable's gradient (in that
+  // case 'inputs' will just contain one member, which is the gradient for
+  // the source Variable, and we use 'dim' and 'offset' to construct the
+  // sub-tensor).
+  int64_t offset;
+
+  // This stores the gradient (if we already have one), or nullptr if not.
+ std::unique_ptr grad{nullptr}; + + +}; + + +class Variable { + using GradFunc = std::function< + void(std::vector& inputs, const Variable& grad_output)>; + + +}; + +typedef std::unique_ptr + + + + +}; From 0688a87aef8fa7d6f3b44d004b613f300e8bf359 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 15 Mar 2019 14:20:10 -0400 Subject: [PATCH 2/4] [src] removing matrix column stride (not supported by BLAS); various fixes --- src/matrix/kaldi-matrix.cc | 112 ++++++++++++++++++------------------- src/matrix/kaldi-matrix.h | 71 +++++++++++------------ src/matrix/kaldi-vector.cc | 77 ++++++++++--------------- src/matrix/kaldi-vector.h | 33 ++++++++--- src/matrix/sp-matrix.cc | 7 ++- 5 files changed, 145 insertions(+), 155 deletions(-) diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc index 48481f5f19e..d70ac5cefc8 100644 --- a/src/matrix/kaldi-matrix.cc +++ b/src/matrix/kaldi-matrix.cc @@ -117,10 +117,10 @@ template<> template<> void MatrixBase::AddVecVec(const float alpha, const VectorBase &a, - const VectorBase &rb) { - KALDI_ASSERT(a.Dim() == num_rows_ && rb.Dim() == num_cols_); - cblas_Xger(a.Dim(), rb.Dim(), alpha, a.Data(), 1, rb.Data(), - 1, data_, stride_); + const VectorBase &b) { + KALDI_ASSERT(a.Dim() == num_rows_ && b.Dim() == num_cols_); + cblas_Xger(a.Dim(), b.Dim(), alpha, a.Data(), a.Stride(), + b.Data(), b.Stride(), data_, stride_); } template @@ -132,15 +132,18 @@ void MatrixBase::AddVecVec(const Real alpha, if (num_rows_ * num_cols_ > 100) { // It's probably worth it to allocate // temporary vectors of the right type and use BLAS. Vector temp_a(a), temp_b(b); - cblas_Xger(num_rows_, num_cols_, alpha, temp_a.Data(), 1, - temp_b.Data(), 1, data_, stride_); + cblas_Xger(num_rows_, num_cols_, alpha, + temp_a.Data(), temp_a.Stride(), + temp_b.Data(), temp_b.Stride(), + data_, stride_); } else { const OtherReal *a_data = a.Data(), *b_data = b.Data(); + MatrixIndexT a_stride = a.Stride(), b_stride = b.Stride(); Real *row_data = data_; for (MatrixIndexT i = 0; i < num_rows_; i++, row_data += stride_) { - BaseFloat alpha_ai = alpha * a_data[i]; + BaseFloat alpha_ai = alpha * a_data[i * a_stride]; for (MatrixIndexT j = 0; j < num_cols_; j++) - row_data[j] += alpha_ai * b_data[j]; + row_data[j] += alpha_ai * b_data[j * b_stride]; } } } @@ -159,11 +162,11 @@ template<> template<> void MatrixBase::AddVecVec(const double alpha, const VectorBase &a, - const VectorBase &rb) { - KALDI_ASSERT(a.Dim() == num_rows_ && rb.Dim() == num_cols_); + const VectorBase &b) { + KALDI_ASSERT(a.Dim() == num_rows_ && b.Dim() == num_cols_); if (num_rows_ == 0) return; - cblas_Xger(a.Dim(), rb.Dim(), alpha, a.Data(), 1, rb.Data(), - 1, data_, stride_); + cblas_Xger(a.Dim(), b.Dim(), alpha, a.Data(), a.Stride(), + b.Data(), b.Stride(), data_, stride_); } template @@ -591,8 +594,10 @@ void MatrixBase::AddDiagVecMat( if (transM == kTrans) std::swap(M_row_stride, M_col_stride); Real *data = data_; const Real *Mdata = M.Data(), *vdata = v.Data(); + MatrixIndexT v_stride = v.Stride(); if (num_rows_ == 0) return; - for (MatrixIndexT i = 0; i < num_rows; i++, data += stride, Mdata += M_row_stride, vdata++) + for (MatrixIndexT i = 0; i < num_rows; + i++, data += stride, Mdata += M_row_stride, vdata += v_stride) cblas_Xaxpy(num_cols, alpha * *vdata, Mdata, M_col_stride, data, 1); } @@ -623,10 +628,11 @@ void MatrixBase::AddMatDiagVec( Real *data = data_; const Real *Mdata = M.Data(), *vdata = v.Data(); + MatrixIndexT v_stride = v.Stride(); if (num_rows_ == 0) return; for (MatrixIndexT i = 0; i < 
num_rows; i++){ for(MatrixIndexT j = 0; j < num_cols; j ++ ){ - data[i*stride + j] += alpha * vdata[j] * Mdata[i*M_row_stride + j*M_col_stride]; + data[i*stride + j] += alpha * vdata[j * v_stride] * Mdata[i*M_row_stride + j*M_col_stride]; } } } @@ -658,7 +664,8 @@ void MatrixBase::AddMatMatElements(const Real alpha, template void MatrixBase::LapackGesvd(VectorBase *s, MatrixBase *U_in, MatrixBase *V_in) { - KALDI_ASSERT(s != NULL && U_in != this && V_in != this); + KALDI_ASSERT(s != NULL && U_in != this && V_in != this && + s->Stride() == 1); Matrix tmpU, tmpV; if (U_in == NULL) tmpU.Resize(this->num_rows_, 1); // work-space if U_in empty. @@ -786,13 +793,12 @@ inline void Matrix::Init(const MatrixIndexT rows, KALDI_ASSERT(rows == 0 && cols == 0); this->num_rows_ = 0; this->num_cols_ = 0; - this->row_stride_ = 0; - this->col_stride_ = 0; + this->stride_ = 0; this->data_ = NULL; return; } KALDI_ASSERT(rows > 0 && cols > 0); - MatrixIndexT skip, row_stride; + MatrixIndexT skip, stride; size_t size; void *data; // aligned memory block void *temp; // memory block to be really freed @@ -800,8 +806,8 @@ inline void Matrix::Init(const MatrixIndexT rows, // compute the size of skip and real cols skip = ((16 / sizeof(Real)) - cols % (16 / sizeof(Real))) % (16 / sizeof(Real)); - row_stride = cols + skip; - size = static_cast(rows) * static_cast(row_stride) + stride = cols + skip; + size = static_cast(rows) * static_cast(stride) * sizeof(Real); // allocate the memory and set the right dimensions and parameters @@ -809,8 +815,7 @@ inline void Matrix::Init(const MatrixIndexT rows, MatrixBase::data_ = static_cast (data); MatrixBase::num_rows_ = rows; MatrixBase::num_cols_ = cols; - MatrixBase::row_stride_ = (stride_type == kDefaultStride ? stride : cols); - MatrixBase::col_stride_ = 1; + MatrixBase::stride_ = (stride_type == kDefaultStride ? stride : cols); } else { throw std::bad_alloc(); } @@ -826,7 +831,7 @@ void Matrix::Resize(const MatrixIndexT rows, if (resize_type == kCopyData) { if (this->data_ == NULL || rows == 0) resize_type = kSetZero; // nothing to copy. else if (rows == this->num_rows_ && cols == this->num_cols_ && - (stride_type == kDefaultStride || this->row_stride_ == this->num_cols_)) { return; } // nothing to do. + (stride_type == kDefaultStride || this->stride_ == this->num_cols_)) { return; } // nothing to do. else { // set tmp to a matrix of the desired size; if new matrix // is bigger in some dimension, zero it. 
@@ -876,14 +881,12 @@ void MatrixBase::CopyFromMat(const MatrixBase &M, (*this).Row(i).CopyFromVec(M.Row(i)); } else { KALDI_ASSERT(num_cols_ == M.NumRows() && num_rows_ == M.NumCols()); - int32 this_row_stride = row_stride_, this_col_stride = col_stride_, - other_row_stride = M.RowStride(), other_col_stride = M.ColStride(); + int32 this_stride = stride_, other_stride = M.Stride(); Real *this_data = data_; const OtherReal *other_data = M.Data(); for (MatrixIndexT i = 0; i < num_rows_; i++) for (MatrixIndexT j = 0; j < num_cols_; j++) - this_data[i * this_row_stride + j * this_col_stride] = - other_data[j * other_row_stride + i * other_col_stride]; + this_data[i * this_stride + j] = other_data[j * other_stride + i]; } } @@ -906,17 +909,15 @@ template<> template<> void MatrixBase::CopyFromSp(const SpMatrix & M) { KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); - MatrixIndexT num_rows = num_rows_, - row_stride = row_stride_, - col_stride = col_stride_; + MatrixIndexT num_rows = num_rows_, stride = stride_; const float *Mdata = M.Data(); float *row_data = data_, *col_data = data_; for (MatrixIndexT i = 0; i < num_rows; i++) { - cblas_scopy(i + 1, Mdata, 1, row_data, col_stride); // copy to the row. - cblas_scopy(i, Mdata, 1, col_data, row_stride); // copy to the column. - Mdata += i + 1; - row_data += row_stride; - col_data += col_stride; + cblas_scopy(i+1, Mdata, 1, row_data, 1); // copy to the row. + cblas_scopy(i, Mdata, 1, col_data, stride); // copy to the column. + Mdata += i+1; + row_data += stride; + col_data += 1; } } @@ -925,17 +926,15 @@ template<> template<> void MatrixBase::CopyFromSp(const SpMatrix & M) { KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); - MatrixIndexT num_rows = num_rows_, - row_stride = row_stride_, - col_stride = col_stride_; + MatrixIndexT num_rows = num_rows_, stride = stride_; const double *Mdata = M.Data(); double *row_data = data_, *col_data = data_; for (MatrixIndexT i = 0; i < num_rows; i++) { - cblas_dcopy(i+1, Mdata, 1, row_data, col_stride); // copy to the row. - cblas_dcopy(i, Mdata, 1, col_data, row_stride); // copy to the column. + cblas_dcopy(i+1, Mdata, 1, row_data, 1); // copy to the row. + cblas_dcopy(i, Mdata, 1, col_data, stride); // copy to the column. 
Mdata += i+1; - row_data += row_stride; - col_data += col_stride; + row_data += stride; + col_data += 1; } } @@ -964,26 +963,24 @@ template template void MatrixBase::CopyFromTp(const TpMatrix & M, MatrixTransposeType Trans) { - MatrixIndexT row_stride = row_stride_, col_stride = col_stride_; if (Trans == kNoTrans) { KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); SetZero(); Real *out_i = data_; const OtherReal *in_i = M.Data(); - for (MatrixIndexT i = 0; i < num_rows_; - i++, out_i += row_stride_, in_i += i) { + for (MatrixIndexT i = 0; i < num_rows_; i++, out_i += stride_, in_i += i) { for (MatrixIndexT j = 0; j <= i; j++) - out_i[j * col_stride] = in_i[j]; + out_i[j] = in_i[j]; } } else { SetZero(); KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); + MatrixIndexT stride = stride_; Real *out_i = data_; const OtherReal *in_i = M.Data(); - for (MatrixIndexT i = 0; i < num_rows_; - i++, out_i += col_stride, in_i += i) { + for (MatrixIndexT i = 0; i < num_rows_; i++, out_i ++, in_i += i) { for (MatrixIndexT j = 0; j <= i; j++) - out_i[j * row_stride] = in_i[j]; + out_i[j*stride] = in_i[j]; } } } @@ -1004,7 +1001,7 @@ void MatrixBase::CopyFromTp(const TpMatrix & M, template void MatrixBase::CopyRowsFromVec(const VectorBase &rv) { - if (rv.Dim() == num_rows_ * num_cols_) { + if (rv.Dim() == num_rows_*num_cols_) { if (stride_ == num_cols_) { // one big copy operation. const Real *rv_data = rv.Data(); @@ -1796,7 +1793,7 @@ void MatrixBase::DestructiveSvd(VectorBase *s, MatrixBase *U, // Throws exception on error. KALDI_ASSERT(num_rows_>=num_cols_ && "Svd requires that #rows by >= #cols."); // For compatibility with JAMA code. - KALDI_ASSERT(s->Dim() == num_cols_); // s should be the smaller dim. + KALDI_ASSERT(s->Dim() == num_cols_ && s->Stride() == 1); // s should be the smaller dim. KALDI_ASSERT(U == NULL || (U->num_rows_ == num_rows_&&U->num_cols_ == num_cols_)); KALDI_ASSERT(Vt == NULL || (Vt->num_rows_ == num_cols_&&Vt->num_cols_ == num_cols_)); @@ -2002,27 +1999,28 @@ void MatrixBase::OrthogonalizeRows() { // symmetric positive definite). template -void MatrixBase::SymPosSemiDefEig(VectorBase *rs, MatrixBase *rU, Real check_thresh) // e.g. check_thresh = 0.001 +void MatrixBase::SymPosSemiDefEig(VectorBase *s, MatrixBase *U, Real check_thresh) // e.g. check_thresh = 0.001 { const MatrixIndexT D = num_rows_; KALDI_ASSERT(num_rows_ == num_cols_); KALDI_ASSERT(IsSymmetric() && "SymPosSemiDefEig: expecting input to be symmetrical."); - KALDI_ASSERT(rU->num_rows_ == D && rU->num_cols_ == D && rs->Dim() == D); + KALDI_ASSERT(U->num_rows_ == D && U->num_cols_ == D && s->Dim() == D && + s->Stride() == 1); Matrix Vt(D, D); - Svd(rs, rU, &Vt); + Svd(s, U, &Vt); // First just zero any singular values if the column of U and V do not have +ve dot product-- // this may mean we have small negative eigenvalues, and if we zero them the result will be closer to correct. 
for (MatrixIndexT i = 0;i < D;i++) { Real sum = 0.0; - for (MatrixIndexT j = 0;j < D;j++) sum += (*rU)(j, i) * Vt(i, j); - if (sum < 0.0) (*rs)(i) = 0.0; + for (MatrixIndexT j = 0;j < D;j++) sum += (*U)(j, i) * Vt(i, j); + if (sum < 0.0) (*s)(i) = 0.0; } { - Matrix tmpU(*rU); Vector tmps(*rs); tmps.ApplyPow(0.5); + Matrix tmpU(*U); Vector tmps(*s); tmps.ApplyPow(0.5); tmpU.MulColsVec(tmps); SpMatrix tmpThis(D); tmpThis.AddMat2(1.0, tmpU, kNoTrans, 0.0); diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h index 9148f373c82..4b06a22ece9 100644 --- a/src/matrix/kaldi-matrix.h +++ b/src/matrix/kaldi-matrix.h @@ -59,25 +59,20 @@ class MatrixBase { friend class SparseMatrix; friend class SparseMatrix; - /// Returns number of rows (or zero for emtpy matrix). + /// Returns number of rows (or zero for empty matrix). inline MatrixIndexT NumRows() const { return num_rows_; } /// Returns number of columns (or zero for emtpy matrix). inline MatrixIndexT NumCols() const { return num_cols_; } - /// Stride() is deprecated - inline MatrixIndexT Stride() const { return row_stride_; } - - /// The distance in memory between successive rows. Not required to be - /// positive or even nonzero, as long you can't get to the same - /// memory location using different indexes. - inline MatrixIndexT RowStride() const { return row_stride_; } - - /// The distance in memory between successive columns; will normally - /// be 1 but it may be negative or even zero as long as you - /// can't get to the same memory location using differen indexes. - inline MatrixIndexT ColStride() const { return col_stride_; } + /// Stride (distance in memory between each row). Must be >= NumCols(). + inline MatrixIndexT Stride() const { return stride_; } + /// Returns size in bytes of the data held by the matrix. + size_t SizeInBytes() const { + return static_cast(num_rows_) * static_cast(stride_) * + sizeof(Real); + } /// Gives pointer to raw data (const). inline const Real* Data() const { @@ -87,19 +82,18 @@ class MatrixBase { /// Gives pointer to raw data (non-const). inline Real* Data() { return data_; } - /// Returns pointer to data for one row (non-const). - /// Caution: don't assume ColumnStride() is 1. + /// Returns pointer to data for one row (non-const) inline Real* RowData(MatrixIndexT i) { KALDI_ASSERT(static_cast(i) < static_cast(num_rows_)); - return data_ + i * row_stride_; + return data_ + i * stride_; } /// Returns pointer to data for one row (const) inline const Real* RowData(MatrixIndexT i) const { KALDI_ASSERT(static_cast(i) < static_cast(num_rows_)); - return data_ + i * row_stride_; + return data_ + i * stride_; } /// Indexing operator, non-const @@ -109,7 +103,7 @@ class MatrixBase { static_cast(num_rows_) && static_cast(c) < static_cast(num_cols_)); - return *(data_ + r * row_stride_ + c * col_stride_); + return *(data_ + r * stride_ + c); } /// Indexing operator, provided for ease of debugging (gdb doesn't work /// with parenthesis operator). @@ -122,7 +116,7 @@ class MatrixBase { static_cast(num_rows_) && static_cast(c) < static_cast(num_cols_)); - return *(data_ + r * row_stride_ + c * col_stride_); + return *(data_ + r * stride_ + c); } /* Basic setting-to-special values functions. */ @@ -189,18 +183,20 @@ class MatrixBase { /* Accessing of sub-parts of the matrix. */ - /// Return specific row of matrix [const]. - inline const SubVector Row(MatrixIndexT i) const { + /// Return specific row of matrix. Warning: this can get + /// around const constraints. 
+ inline SubVector Row(MatrixIndexT i) const { KALDI_ASSERT(static_cast(i) < static_cast(num_rows_)); - return SubVector(data_ + (i * stride_), NumCols()); + return SubVector(data_ + (i * stride_), num_cols_); } - /// Return specific row of matrix. - inline SubVector Row(MatrixIndexT i) { + /// Return specific column of matrix. Warning: this can get + /// around const constraints. + inline const SubVector Col(MatrixIndexT i) const { KALDI_ASSERT(static_cast(i) < - static_cast(num_rows_)); - return SubVector(data_ + (i * stride_), NumCols()); + static_cast(num_cols_)); + return SubVector(data_ + i, num_rows_, stride_); } /// Return a sub-part of matrix. @@ -412,7 +408,9 @@ class MatrixBase { Null pointers for U and/or Vt at input mean we do not want that output. We expect that S.Dim() == m, U is either NULL or m by n, and v is either NULL or n by n. - The singular values are not sorted (use SortSvd for that). */ + The singular values are not sorted (use SortSvd for that). + Requires that s->Stride() == 1. + */ void DestructiveSvd(VectorBase *s, MatrixBase *U, MatrixBase *Vt); // Destroys calling matrix. @@ -420,6 +418,7 @@ class MatrixBase { /// transposed; the normal formulation is U diag(s) V^T. /// Null pointers for U or V mean we don't want that output (this saves /// compute). The singular values are not sorted (use SortSvd for that). + /// Requires that s->Stride() == 1. void Svd(VectorBase *s, MatrixBase *U, MatrixBase *Vt) const; /// Compute SVD but only retain the singular values. @@ -537,6 +536,7 @@ class MatrixBase { * positive semi-definite (check_thresh controls how stringent the check is; * set it to 2 to ensure it won't ever complain, but it will zero out negative * dimensions in your matrix. + * Requires s->Stride() == 1. */ void SymPosSemiDefEig(VectorBase *s, MatrixBase *P, Real check_thresh = 0.001); @@ -769,20 +769,13 @@ class MatrixBase { /// data memory area Real* data_; + /// these atributes store the real matrix size as it is stored in memory + /// including memalignment MatrixIndexT num_cols_; /// < Number of columns MatrixIndexT num_rows_; /// < Number of rows - MatrixIndexT row_stride_; ///< Row stride (distance in memory between one - ///< row and the next). Expected to - ///< satisfy abs(row_stride_) >= abs(col_stride_) - ///< (although this won't lead to wrong operation - ///< so we don't check this); - ///< and the matrix must have the property - ///< that no element can be accessed via - ///< two different pairs of indexes. - MatrixIndexT col_stride_; ///< Column stride (distance in memory between - ///< one column and the next). Normally - ///< expected to equal 1. - + /** True number of columns for the internal matrix. This number may differ + * from num_cols_ as memory alignment might be used. 
*/ + MatrixIndexT stride_; private: KALDI_DISALLOW_COPY_AND_ASSIGN(MatrixBase); }; diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc index 4756abda94e..655945bd01b 100644 --- a/src/matrix/kaldi-vector.cc +++ b/src/matrix/kaldi-vector.cc @@ -38,7 +38,7 @@ Real VecVec(const VectorBase &a, const VectorBase &b) { MatrixIndexT adim = a.Dim(); KALDI_ASSERT(adim == b.Dim()); - return cblas_Xdot(adim, a.Data(), 1, b.Data(), 1); + return cblas_Xdot(adim, a.Data(), a.Stride(), b.Data(), b.Stride()); } template @@ -76,7 +76,7 @@ void VectorBase::AddVec(const float alpha, const VectorBase &v) { KALDI_ASSERT(dim_ == v.dim_); KALDI_ASSERT(&v != this); - cblas_Xaxpy(dim_, alpha, v.Data(), 1, data_, 1); + cblas_Xaxpy(dim_, alpha, v.Data(), v.stride_, data_, stride_); } template<> @@ -85,7 +85,7 @@ void VectorBase::AddVec(const double alpha, const VectorBase &v) { KALDI_ASSERT(dim_ == v.dim_); KALDI_ASSERT(&v != this); - cblas_Xaxpy(dim_, alpha, v.Data(), 1, data_, 1); + cblas_Xaxpy(dim_, alpha, v.Data(), v.stride_, data_, stride_); } template @@ -98,7 +98,7 @@ void VectorBase::AddMatVec(const Real alpha, || (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_)); KALDI_ASSERT(&v != this); cblas_Xgemv(trans, M.NumRows(), M.NumCols(), alpha, M.Data(), M.Stride(), - v.Data(), 1, beta, data_, 1); + v.Data(), v.stride_, beta, data_, stride_); } template @@ -111,40 +111,19 @@ void VectorBase::AddMatSvec(const Real alpha, || (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_)); KALDI_ASSERT(&v != this); Xgemv_sparsevec(trans, M.NumRows(), M.NumCols(), alpha, M.Data(), M.Stride(), - v.Data(), 1, beta, data_, 1); + v.Data(), v.stride_, beta, data_, stride_); return; - /* - MatrixIndexT this_dim = this->dim_, v_dim = v.dim_, - M_stride = M.Stride(); - Real *this_data = this->data_; - const Real *M_data = M.Data(), *v_data = v.data_; - if (beta != 1.0) this->Scale(beta); - if (trans == kNoTrans) { - for (MatrixIndexT i = 0; i < v_dim; i++) { - Real v_i = v_data[i]; - if (v_i == 0.0) continue; - // Add to *this, the i'th column of the Matrix, times v_i. - cblas_Xaxpy(this_dim, v_i * alpha, M_data + i, M_stride, this_data, 1); - } - } else { // The transposed case is slightly more efficient, I guess. - for (MatrixIndexT i = 0; i < v_dim; i++) { - Real v_i = v.data_[i]; - if (v_i == 0.0) continue; - // Add to *this, the i'th row of the Matrix, times v_i. 
- cblas_Xaxpy(this_dim, v_i * alpha, - M_data + (i * M_stride), 1, this_data, 1); - } - }*/ } template void VectorBase::AddSpVec(const Real alpha, - const SpMatrix &M, - const VectorBase &v, - const Real beta) { + const SpMatrix &M, + const VectorBase &v, + const Real beta) { KALDI_ASSERT(M.NumRows() == v.dim_ && dim_ == v.dim_); KALDI_ASSERT(&v != this); - cblas_Xspmv(alpha, M.NumRows(), M.Data(), v.Data(), 1, beta, data_, 1); + cblas_Xspmv(alpha, M.NumRows(), M.Data(), v.Data(), v.stride_, + beta, data_, stride_); } @@ -165,7 +144,7 @@ void VectorBase::Solve(const TpMatrix &M, template inline void Vector::Init(const MatrixIndexT dim) { - stride_ = 1; + this->stride_ = 1; KALDI_ASSERT(dim >= 0); if (dim == 0) { this->dim_ = 0; @@ -672,7 +651,7 @@ void VectorBase::CopyColFromMat(const MatrixBase &mat, MatrixInd template void VectorBase::CopyDiagFromMat(const MatrixBase &M) { KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols())); - cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, 1); + cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, stride_); } template @@ -689,7 +668,7 @@ Real VectorBase::Sum() const { // implement sum. This allows us to access SIMD operations in a // cross-platform way via your BLAS library. Real one(1); - return cblas_Xdot(dim_, data_, 1, &one, 0); + return cblas_Xdot(dim_, data_, stride_, &one, 0); } template @@ -712,15 +691,16 @@ Real VectorBase::SumLog() const { template void VectorBase::AddRowSumMat(Real alpha, const MatrixBase &M, Real beta) { KALDI_ASSERT(dim_ == M.NumCols()); - MatrixIndexT num_rows = M.NumRows(), stride = M.Stride(), dim = dim_; + MatrixIndexT num_rows = M.NumRows(), m_stride = M.Stride(), + this_stride = stride_, dim = dim_; Real *data = data_; // implement the function according to a dimension cutoff for computation efficiency if (num_rows <= 64) { - cblas_Xscal(dim, beta, data, 1); + cblas_Xscal(dim, beta, data, this_stride); const Real *m_data = M.Data(); - for (MatrixIndexT i = 0; i < num_rows; i++, m_data += stride) - cblas_Xaxpy(dim, alpha, m_data, 1, data, 1); + for (MatrixIndexT i = 0; i < num_rows; i++, m_data += m_stride) + cblas_Xaxpy(dim, alpha, m_data, 1, data, stride_); } else { Vector ones(M.NumRows()); @@ -773,17 +753,19 @@ Real VectorBase::LogSumExp(Real prune) const { template void VectorBase::InvertElements() { - for (MatrixIndexT i = 0; i < dim_; i++) { - data_[i] = static_cast(1 / data_[i]); + MatrixIndexT dim = dim_, stride = stride_; + for (MatrixIndexT i = 0; i < dim; i++) { + data_[i * stride] = static_cast(1) / data_[i * stride]; } } template void VectorBase::ApplyLog() { - for (MatrixIndexT i = 0; i < dim_; i++) { - if (data_[i] < 0.0) + MatrixIndexT dim = dim_, stride = stride_; + for (MatrixIndexT i = 0; i < dim; i++) { + if (data_[i * stride] < 0.0) KALDI_ERR << "Trying to take log of a negative number."; - data_[i] = Log(data_[i]); + data_[i * stride] = Log(data_[i * stride]); } } @@ -954,7 +936,7 @@ void VectorBase::Add(Real c) { template void VectorBase::Scale(Real alpha) { - cblas_Xscal(dim_, alpha, data_, 1); + cblas_Xscal(dim_, alpha, data_, stride_); } template @@ -995,8 +977,8 @@ void VectorBase::AddVecVec(Real alpha, const VectorBase &v, KALDI_ASSERT(v.data_ != this->data_ && r.data_ != this->data_); // We pretend that v is a band-diagonal matrix. 
   KALDI_ASSERT(dim_ == v.dim_ && dim_ == r.dim_);
-  cblas_Xgbmv(kNoTrans, dim_, dim_, 0, 0, alpha, v.data_, 1,
-              r.data_, 1, beta, this->data_, 1);
+  cblas_Xgbmv(kNoTrans, dim_, dim_, 0, 0, alpha, v.data_, v.stride_,
+              r.data_, r.stride_, beta, this->data_, stride_);
 }
 
@@ -1304,7 +1286,8 @@ void VectorBase::AddDiagMat2(
     Real *data = this->data_;
     const Real *mat_data = M.Data();
     for (MatrixIndexT i = 0; i < rows; i++, mat_data += mat_stride, data++)
-      *data = beta * *data + alpha * cblas_Xdot(cols,mat_data,1,mat_data,1);
+      *data = beta * *data + alpha * cblas_Xdot(cols, mat_data, 1,
+                                                mat_data, 1);
   } else {
     KALDI_ASSERT(this->dim_ == M.NumCols());
     MatrixIndexT rows = M.NumRows(), cols = this->dim_,
diff --git a/src/matrix/kaldi-vector.h b/src/matrix/kaldi-vector.h
index 2a50ae2f2ce..2a10b129ef5 100644
--- a/src/matrix/kaldi-vector.h
+++ b/src/matrix/kaldi-vector.h
@@ -502,11 +502,18 @@ class SubVector : public VectorBase {
             const MatrixIndexT begin,
             const MatrixIndexT num_elements,
             const MatrixIndexT step = 1) : VectorBase() {
-    KALDI_ASSERT(stride > 0 && static_cast(src)+
-                 static_cast((num_elements - 1) * step)
-                 < static_cast(t.Dim()));
-    VectorBase::data_ = const_cast (t.Data()+src);
-    VectorBase::dim_ = length;
+    // Casting to UnsignedMatrixIndexT is a mechanism to test something
+    // is >= 0 as well as < x (for positive x) in a single comparison.
+    typedef UnsignedMatrixIndexT U;
+    KALDI_ASSERT(
+        step != 0 &&
+        static_cast(begin) < static_cast(src.Dim()) &&
+        static_cast(begin + step * (num_elements - 1)) <
+        static_cast(src.Dim()));
+    VectorBase::data_ = const_cast (src.Data() +
+                                    begin * src.Stride());
+    VectorBase::dim_ = num_elements;
+    VectorBase::stride_ = step * src.Stride();
   }
 
   /// This constructor initializes the vector to point at the contents
   /// of the given packed matrix (SpMatrix or TpMatrix).
+  SubVector(const PackedMatrix &M) {
     VectorBase::data_ = const_cast (M.Data());
     VectorBase::dim_ = (M.NumRows()*(M.NumRows()+1))/2;
+    VectorBase::stride_ = 1;
   }
 
   /// Copy constructor
   SubVector(const SubVector &other) : VectorBase () {
     // this copy constructor needed for Range() to work in base class.
     VectorBase::data_ = other.data_;
     VectorBase::dim_ = other.dim_;
+    VectorBase::stride_ = other.stride_;
   }
 
-  /// Constructor from a pointer to memory and a length.  Keeps a pointer
-  /// to the data but does not take ownership (will never delete).
-  /// Caution: this constructor enables you to evade const constraints.
-  SubVector(const Real *data, MatrixIndexT length) : VectorBase () {
+  /// Constructor from a pointer to memory and a length, and an optional
+  /// stride.  Keeps a pointer to the data but does not take ownership (will
+  /// never delete).  Caution: this constructor enables you to evade const
+  /// constraints.
+  SubVector(const Real *data, MatrixIndexT length, MatrixIndexT stride = 1):
+      VectorBase () {
     VectorBase::data_ = const_cast(data);
     VectorBase::dim_ = length;
+    VectorBase::stride_ = stride;
   }
 
   /// This operation does not preserve const-ness, so be careful.
+  /// This function is somewhat deprecated for being ambiguous;
+  /// MatrixBase::Row() is probably preferred.
   SubVector(const MatrixBase &matrix, MatrixIndexT row) {
     VectorBase::data_ = const_cast(matrix.RowData(row));
     VectorBase::dim_ = matrix.NumCols();
+    VectorBase::stride_ = 1;
   }
 
   ~SubVector() {}  ///< Destructor (does nothing; no pointers are owned here).
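NOTE: to make the strided-vector API above concrete, here is a hypothetical
usage sketch (not part of the patch; the variable names are illustrative).
It relies only on operations this series makes stride-aware -- the strided
SubVector constructor, Scale(), VecVec() and MatrixBase::Col() -- and the
series itself cautions that stride != 1 is not yet supported everywhere:

  Vector<float> v(10);                 // owns contiguous storage; stride 1
  SubVector<float> evens(v, 0, 5, 2);  // view of elements 0, 2, 4, 6, 8 of v
  evens.Scale(2.0);                    // cblas_Xscal is invoked with stride 2
  Matrix<float> M(5, 6);
  SubVector<float> col2(M.Col(2));     // column view: dim 5, stride == M.Stride()
  float dot = VecVec(evens, col2);     // cblas_Xdot with strides 2 and M.Stride()

Nothing is copied here: each SubVector aliases the original storage, which is
why these views cannot be made const-safe.
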
diff --git a/src/matrix/sp-matrix.cc b/src/matrix/sp-matrix.cc index 224ef39fb6e..40511f537ef 100644 --- a/src/matrix/sp-matrix.cc +++ b/src/matrix/sp-matrix.cc @@ -180,16 +180,17 @@ Real SpMatrix::Trace() const { // diagonal update, this <-- this + diag(v) template template -void SpMatrix::AddDiagVec(const Real alpha, const VectorBase &v) { +void SpMatrix::AddDiagVec(const Real alpha, const VectorBase &v) { int32 num_rows = this->num_rows_; KALDI_ASSERT(num_rows == v.Dim() && num_rows > 0); const OtherReal *src = v.Data(); Real *dst = this->data_; + MatrixIndexT src_stride = v.Stride(); if (alpha == 1.0) - for (int32 i = 1; i <= num_rows; i++, src++, dst += i) + for (int32 i = 1; i <= num_rows; i++, src += src_stride, dst += i) *dst += *src; else - for (int32 i = 1; i <= num_rows; i++, src++, dst += i) + for (int32 i = 1; i <= num_rows; i++, src += src_stride, dst += i) *dst += alpha * *src; } From dda4479af692dd84175494294717edff95683976 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 15 Mar 2019 14:20:40 -0400 Subject: [PATCH 3/4] [src] some updates to kaldi-vector.cc RE column stride, etc. --- src/matrix/kaldi-vector.cc | 63 +++++++++++--------------------------- 1 file changed, 18 insertions(+), 45 deletions(-) diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc index 4756abda94e..fd6b76459ee 100644 --- a/src/matrix/kaldi-vector.cc +++ b/src/matrix/kaldi-vector.cc @@ -55,9 +55,10 @@ Real VecVec(const VectorBase &ra, KALDI_ASSERT(adim == rb.Dim()); const Real *a_data = ra.Data(); const OtherReal *b_data = rb.Data(); + MatrixIndexT a_stride = a.Stride(), b_stride = b.Stride(); Real sum = 0.0; for (MatrixIndexT i = 0; i < adim; i++) - sum += a_data[i]*b_data[i]; + sum += a_data[i * a_stride] * b_data[i * b_stride]; return sum; } @@ -111,30 +112,8 @@ void VectorBase::AddMatSvec(const Real alpha, || (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_)); KALDI_ASSERT(&v != this); Xgemv_sparsevec(trans, M.NumRows(), M.NumCols(), alpha, M.Data(), M.Stride(), - v.Data(), 1, beta, data_, 1); + v.Data(), v.Stride(), beta, data_, stride_); return; - /* - MatrixIndexT this_dim = this->dim_, v_dim = v.dim_, - M_stride = M.Stride(); - Real *this_data = this->data_; - const Real *M_data = M.Data(), *v_data = v.data_; - if (beta != 1.0) this->Scale(beta); - if (trans == kNoTrans) { - for (MatrixIndexT i = 0; i < v_dim; i++) { - Real v_i = v_data[i]; - if (v_i == 0.0) continue; - // Add to *this, the i'th column of the Matrix, times v_i. - cblas_Xaxpy(this_dim, v_i * alpha, M_data + i, M_stride, this_data, 1); - } - } else { // The transposed case is slightly more efficient, I guess. - for (MatrixIndexT i = 0; i < v_dim; i++) { - Real v_i = v.data_[i]; - if (v_i == 0.0) continue; - // Add to *this, the i'th row of the Matrix, times v_i. 
- cblas_Xaxpy(this_dim, v_i * alpha, - M_data + (i * M_stride), 1, this_data, 1); - } - }*/ } template @@ -144,7 +123,8 @@ void VectorBase::AddSpVec(const Real alpha, const Real beta) { KALDI_ASSERT(M.NumRows() == v.dim_ && dim_ == v.dim_); KALDI_ASSERT(&v != this); - cblas_Xspmv(alpha, M.NumRows(), M.Data(), v.Data(), 1, beta, data_, 1); + cblas_Xspmv(alpha, M.NumRows(), M.Data(), v.Data(), v.Stride(), beta, + data_, stride_); } @@ -672,14 +652,15 @@ void VectorBase::CopyColFromMat(const MatrixBase &mat, MatrixInd template void VectorBase::CopyDiagFromMat(const MatrixBase &M) { KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols())); - cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, 1); + cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, stride_); } template void VectorBase::CopyDiagFromPacked(const PackedMatrix &M) { KALDI_ASSERT(dim_ == M.NumCols()); - for (MatrixIndexT i = 0; i < dim_; i++) - data_[i] = M(i, i); + MatrixIndexT stride = stride_, dim = dim_; + for (MatrixIndexT i = 0; i < dim; i++) + data_[i * stride] = M(i, i); // could make this more efficient. } @@ -689,15 +670,16 @@ Real VectorBase::Sum() const { // implement sum. This allows us to access SIMD operations in a // cross-platform way via your BLAS library. Real one(1); - return cblas_Xdot(dim_, data_, 1, &one, 0); + return cblas_Xdot(dim_, data_, stride_, &one, 0); } template Real VectorBase::SumLog() const { double sum_log = 0.0; double prod = 1.0; + MatrixIndexT dim = dim_, stride = stride_; for (MatrixIndexT i = 0; i < dim_; i++) { - prod *= data_[i]; + prod *= data_[i * stride]; // Possible future work (arnab): change these magic values to pre-defined // constants if (prod < 1.0e-10 || prod > 1.0e+10) { @@ -710,23 +692,14 @@ Real VectorBase::SumLog() const { } template -void VectorBase::AddRowSumMat(Real alpha, const MatrixBase &M, Real beta) { +void VectorBase::AddRowSumMat(Real alpha, const MatrixBase &M, + Real beta) { KALDI_ASSERT(dim_ == M.NumCols()); - MatrixIndexT num_rows = M.NumRows(), stride = M.Stride(), dim = dim_; - Real *data = data_; - // implement the function according to a dimension cutoff for computation efficiency - if (num_rows <= 64) { - cblas_Xscal(dim, beta, data, 1); - const Real *m_data = M.Data(); - for (MatrixIndexT i = 0; i < num_rows; i++, m_data += stride) - cblas_Xaxpy(dim, alpha, m_data, 1, data, 1); - - } else { - Vector ones(M.NumRows()); - ones.Set(1.0); - this->AddMatVec(alpha, M, kTrans, ones, beta); - } + // treat 'one' as a vector with stride zero. + Real one(1); + cblas_Xgemv(kTrans, M.NumRows(), M.NumCols(), alpha, M.Data(), + M.Stride(), &one, 0, data_, stride_, beta, data_, stride_); } template From d32f22e4cc934f3c3de9abd005325bcb362f1180 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 15 Mar 2019 15:50:24 -0400 Subject: [PATCH 4/4] [egs] Revert unwanted change to cmd.sh --- egs/mini_librispeech/s5/cmd.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/egs/mini_librispeech/s5/cmd.sh b/egs/mini_librispeech/s5/cmd.sh index 223eb21c55d..71dd849a93b 100644 --- a/egs/mini_librispeech/s5/cmd.sh +++ b/egs/mini_librispeech/s5/cmd.sh @@ -10,6 +10,6 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
-export train_cmd="retry.pl --num-tries 3 queue.pl --mem 2G" -export decode_cmd="retry.pl --num-tries 3 queue.pl --mem 4G" -export mkgraph_cmd="retry.pl --num-tries 3 queue.pl --mem 8G" +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G"