From fc2423d27595bcd2d4aa3846212fcf7dba732f48 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 14 Mar 2019 20:58:26 -0400 Subject: [PATCH 1/4] [src] starting to refactor matrix/vector; drafting tensor stuff --- egs/mini_librispeech/s5/cmd.sh | 6 +- src/matrix/kaldi-matrix.cc | 64 ++++++----- src/matrix/kaldi-matrix.h | 47 ++++++--- src/matrix/kaldi-vector.cc | 14 +-- src/matrix/kaldi-vector.h | 63 ++++++----- src/tensor/tensor.h | 187 +++++++++++++++++++++++++++++++++ 6 files changed, 298 insertions(+), 83 deletions(-) create mode 100644 src/tensor/tensor.h diff --git a/egs/mini_librispeech/s5/cmd.sh b/egs/mini_librispeech/s5/cmd.sh index 71dd849a93b..223eb21c55d 100644 --- a/egs/mini_librispeech/s5/cmd.sh +++ b/egs/mini_librispeech/s5/cmd.sh @@ -10,6 +10,6 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -export train_cmd="queue.pl --mem 2G" -export decode_cmd="queue.pl --mem 4G" -export mkgraph_cmd="queue.pl --mem 8G" +export train_cmd="retry.pl --num-tries 3 queue.pl --mem 2G" +export decode_cmd="retry.pl --num-tries 3 queue.pl --mem 4G" +export mkgraph_cmd="retry.pl --num-tries 3 queue.pl --mem 8G" diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc index fcfe0616b64..48481f5f19e 100644 --- a/src/matrix/kaldi-matrix.cc +++ b/src/matrix/kaldi-matrix.cc @@ -28,7 +28,7 @@ #include "matrix/compressed-matrix.h" #include "matrix/sparse-matrix.h" -static_assert(int(kaldi::kNoTrans) == int(CblasNoTrans) && int(kaldi::kTrans) == int(CblasTrans), +static_assert(int(kaldi::kNoTrans) == int(CblasNoTrans) && int(kaldi::kTrans) == int(CblasTrans), "kaldi::kNoTrans and kaldi::kTrans must be equal to the appropriate CBLAS library constants!"); namespace kaldi { @@ -538,7 +538,7 @@ void MatrixBase::AddMatSmat(Real alpha, const MatrixBase &A, // pass stride to write a column as matrices are stored in row major order. cblas_Xaxpy(this_num_rows, alpha_B_jk, a_col_k, A.stride_, this_col_j, this->stride_); - //for (MatrixIndexT i = 0; i < this_num_rows; ++i) + //for (MatrixIndexT i = 0; i < this_num_rows; ++i) // this_col_j[i*this->stride_] += alpha_B_jk * a_col_k[i*A.stride_]; } } @@ -786,12 +786,13 @@ inline void Matrix::Init(const MatrixIndexT rows, KALDI_ASSERT(rows == 0 && cols == 0); this->num_rows_ = 0; this->num_cols_ = 0; - this->stride_ = 0; + this->row_stride_ = 0; + this->col_stride_ = 0; this->data_ = NULL; return; } KALDI_ASSERT(rows > 0 && cols > 0); - MatrixIndexT skip, stride; + MatrixIndexT skip, row_stride; size_t size; void *data; // aligned memory block void *temp; // memory block to be really freed @@ -799,8 +800,8 @@ inline void Matrix::Init(const MatrixIndexT rows, // compute the size of skip and real cols skip = ((16 / sizeof(Real)) - cols % (16 / sizeof(Real))) % (16 / sizeof(Real)); - stride = cols + skip; - size = static_cast(rows) * static_cast(stride) + row_stride = cols + skip; + size = static_cast(rows) * static_cast(row_stride) * sizeof(Real); // allocate the memory and set the right dimensions and parameters @@ -808,7 +809,8 @@ inline void Matrix::Init(const MatrixIndexT rows, MatrixBase::data_ = static_cast (data); MatrixBase::num_rows_ = rows; MatrixBase::num_cols_ = cols; - MatrixBase::stride_ = (stride_type == kDefaultStride ? stride : cols); + MatrixBase::row_stride_ = (stride_type == kDefaultStride ? 
stride : cols); + MatrixBase::col_stride_ = 1; } else { throw std::bad_alloc(); } @@ -824,7 +826,7 @@ void Matrix::Resize(const MatrixIndexT rows, if (resize_type == kCopyData) { if (this->data_ == NULL || rows == 0) resize_type = kSetZero; // nothing to copy. else if (rows == this->num_rows_ && cols == this->num_cols_ && - (stride_type == kDefaultStride || this->stride_ == this->num_cols_)) { return; } // nothing to do. + (stride_type == kDefaultStride || this->row_stride_ == this->num_cols_)) { return; } // nothing to do. else { // set tmp to a matrix of the desired size; if new matrix // is bigger in some dimension, zero it. @@ -874,12 +876,14 @@ void MatrixBase::CopyFromMat(const MatrixBase &M, (*this).Row(i).CopyFromVec(M.Row(i)); } else { KALDI_ASSERT(num_cols_ == M.NumRows() && num_rows_ == M.NumCols()); - int32 this_stride = stride_, other_stride = M.Stride(); + int32 this_row_stride = row_stride_, this_col_stride = col_stride_, + other_row_stride = M.RowStride(), other_col_stride = M.ColStride(); Real *this_data = data_; const OtherReal *other_data = M.Data(); for (MatrixIndexT i = 0; i < num_rows_; i++) for (MatrixIndexT j = 0; j < num_cols_; j++) - this_data[i * this_stride + j] = other_data[j * other_stride + i]; + this_data[i * this_row_stride + j * this_col_stride] = + other_data[j * other_row_stride + i * other_col_stride]; } } @@ -902,15 +906,17 @@ template<> template<> void MatrixBase::CopyFromSp(const SpMatrix & M) { KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); - MatrixIndexT num_rows = num_rows_, stride = stride_; + MatrixIndexT num_rows = num_rows_, + row_stride = row_stride_, + col_stride = col_stride_; const float *Mdata = M.Data(); float *row_data = data_, *col_data = data_; for (MatrixIndexT i = 0; i < num_rows; i++) { - cblas_scopy(i+1, Mdata, 1, row_data, 1); // copy to the row. - cblas_scopy(i, Mdata, 1, col_data, stride); // copy to the column. - Mdata += i+1; - row_data += stride; - col_data += 1; + cblas_scopy(i + 1, Mdata, 1, row_data, col_stride); // copy to the row. + cblas_scopy(i, Mdata, 1, col_data, row_stride); // copy to the column. + Mdata += i + 1; + row_data += row_stride; + col_data += col_stride; } } @@ -919,15 +925,17 @@ template<> template<> void MatrixBase::CopyFromSp(const SpMatrix & M) { KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); - MatrixIndexT num_rows = num_rows_, stride = stride_; + MatrixIndexT num_rows = num_rows_, + row_stride = row_stride_, + col_stride = col_stride_; const double *Mdata = M.Data(); double *row_data = data_, *col_data = data_; for (MatrixIndexT i = 0; i < num_rows; i++) { - cblas_dcopy(i+1, Mdata, 1, row_data, 1); // copy to the row. - cblas_dcopy(i, Mdata, 1, col_data, stride); // copy to the column. + cblas_dcopy(i+1, Mdata, 1, row_data, col_stride); // copy to the row. + cblas_dcopy(i, Mdata, 1, col_data, row_stride); // copy to the column. 
Mdata += i+1; - row_data += stride; - col_data += 1; + row_data += row_stride; + col_data += col_stride; } } @@ -956,24 +964,26 @@ template template void MatrixBase::CopyFromTp(const TpMatrix & M, MatrixTransposeType Trans) { + MatrixIndexT row_stride = row_stride_, col_stride = col_stride_; if (Trans == kNoTrans) { KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); SetZero(); Real *out_i = data_; const OtherReal *in_i = M.Data(); - for (MatrixIndexT i = 0; i < num_rows_; i++, out_i += stride_, in_i += i) { + for (MatrixIndexT i = 0; i < num_rows_; + i++, out_i += row_stride_, in_i += i) { for (MatrixIndexT j = 0; j <= i; j++) - out_i[j] = in_i[j]; + out_i[j * col_stride] = in_i[j]; } } else { SetZero(); KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); - MatrixIndexT stride = stride_; Real *out_i = data_; const OtherReal *in_i = M.Data(); - for (MatrixIndexT i = 0; i < num_rows_; i++, out_i ++, in_i += i) { + for (MatrixIndexT i = 0; i < num_rows_; + i++, out_i += col_stride, in_i += i) { for (MatrixIndexT j = 0; j <= i; j++) - out_i[j*stride] = in_i[j]; + out_i[j * row_stride] = in_i[j]; } } } @@ -994,7 +1004,7 @@ void MatrixBase::CopyFromTp(const TpMatrix & M, template void MatrixBase::CopyRowsFromVec(const VectorBase &rv) { - if (rv.Dim() == num_rows_*num_cols_) { + if (rv.Dim() == num_rows_ * num_cols_) { if (stride_ == num_cols_) { // one big copy operation. const Real *rv_data = rv.Data(); diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h index 11a5e08b15d..9148f373c82 100644 --- a/src/matrix/kaldi-matrix.h +++ b/src/matrix/kaldi-matrix.h @@ -65,14 +65,19 @@ class MatrixBase { /// Returns number of columns (or zero for emtpy matrix). inline MatrixIndexT NumCols() const { return num_cols_; } - /// Stride (distance in memory between each row). Will be >= NumCols. - inline MatrixIndexT Stride() const { return stride_; } + /// Stride() is deprecated + inline MatrixIndexT Stride() const { return row_stride_; } + + /// The distance in memory between successive rows. Not required to be + /// positive or even nonzero, as long you can't get to the same + /// memory location using different indexes. + inline MatrixIndexT RowStride() const { return row_stride_; } + + /// The distance in memory between successive columns; will normally + /// be 1 but it may be negative or even zero as long as you + /// can't get to the same memory location using differen indexes. + inline MatrixIndexT ColStride() const { return col_stride_; } - /// Returns size in bytes of the data held by the matrix. - size_t SizeInBytes() const { - return static_cast(num_rows_) * static_cast(stride_) * - sizeof(Real); - } /// Gives pointer to raw data (const). inline const Real* Data() const { @@ -82,18 +87,19 @@ class MatrixBase { /// Gives pointer to raw data (non-const). inline Real* Data() { return data_; } - /// Returns pointer to data for one row (non-const) + /// Returns pointer to data for one row (non-const). + /// Caution: don't assume ColumnStride() is 1. 
inline Real* RowData(MatrixIndexT i) { KALDI_ASSERT(static_cast(i) < static_cast(num_rows_)); - return data_ + i * stride_; + return data_ + i * row_stride_; } /// Returns pointer to data for one row (const) inline const Real* RowData(MatrixIndexT i) const { KALDI_ASSERT(static_cast(i) < static_cast(num_rows_)); - return data_ + i * stride_; + return data_ + i * row_stride_; } /// Indexing operator, non-const @@ -103,7 +109,7 @@ class MatrixBase { static_cast(num_rows_) && static_cast(c) < static_cast(num_cols_)); - return *(data_ + r * stride_ + c); + return *(data_ + r * row_stride_ + c * col_stride_); } /// Indexing operator, provided for ease of debugging (gdb doesn't work /// with parenthesis operator). @@ -116,7 +122,7 @@ class MatrixBase { static_cast(num_rows_) && static_cast(c) < static_cast(num_cols_)); - return *(data_ + r * stride_ + c); + return *(data_ + r * row_stride_ + c * col_stride_); } /* Basic setting-to-special values functions. */ @@ -763,13 +769,20 @@ class MatrixBase { /// data memory area Real* data_; - /// these atributes store the real matrix size as it is stored in memory - /// including memalignment MatrixIndexT num_cols_; /// < Number of columns MatrixIndexT num_rows_; /// < Number of rows - /** True number of columns for the internal matrix. This number may differ - * from num_cols_ as memory alignment might be used. */ - MatrixIndexT stride_; + MatrixIndexT row_stride_; ///< Row stride (distance in memory between one + ///< row and the next). Expected to + ///< satisfy abs(row_stride_) >= abs(col_stride_) + ///< (although this won't lead to wrong operation + ///< so we don't check this); + ///< and the matrix must have the property + ///< that no element can be accessed via + ///< two different pairs of indexes. + MatrixIndexT col_stride_; ///< Column stride (distance in memory between + ///< one column and the next). Normally + ///< expected to equal 1. + private: KALDI_DISALLOW_COPY_AND_ASSIGN(MatrixBase); }; diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc index c8ea35112ea..4756abda94e 100644 --- a/src/matrix/kaldi-vector.cc +++ b/src/matrix/kaldi-vector.cc @@ -152,19 +152,20 @@ template void VectorBase::MulTp(const TpMatrix &M, const MatrixTransposeType trans) { KALDI_ASSERT(M.NumRows() == dim_); - cblas_Xtpmv(trans,M.Data(),M.NumRows(),data_,1); + cblas_Xtpmv(trans, M.Data(), M.NumRows(), data_, stride_); } template void VectorBase::Solve(const TpMatrix &M, const MatrixTransposeType trans) { KALDI_ASSERT(M.NumRows() == dim_); - cblas_Xtpsv(trans, M.Data(), M.NumRows(), data_, 1); + cblas_Xtpsv(trans, M.Data(), M.NumRows(), data_, stride_); } template inline void Vector::Init(const MatrixIndexT dim) { + stride_ = 1; KALDI_ASSERT(dim >= 0); if (dim == 0) { this->dim_ = 0; @@ -188,7 +189,6 @@ inline void Vector::Init(const MatrixIndexT dim) { template void Vector::Resize(const MatrixIndexT dim, MatrixResizeType resize_type) { - // the next block uses recursion to handle what we have to do if // resize_type == kCopyData. 
 if (resize_type == kCopyData) {
@@ -244,12 +244,6 @@ template void VectorBase::CopyFromPacked(const PackedMatrix &other
 template void VectorBase::CopyFromPacked(const PackedMatrix &other);
 template void VectorBase::CopyFromPacked(const PackedMatrix &other);
 
-/// Load data into the vector
-template
-void VectorBase::CopyFromPtr(const Real *data, MatrixIndexT sz) {
-  KALDI_ASSERT(dim_ == sz);
-  std::memcpy(this->data_, data, Dim() * sizeof(Real));
-}
 
 template
 template
 void VectorBase::CopyFromVec(const VectorBase &other) {
@@ -264,7 +258,7 @@ void VectorBase::CopyFromVec(const VectorBase &other) {
 template void VectorBase::CopyFromVec(const VectorBase &other);
 template void VectorBase::CopyFromVec(const VectorBase &other);
 
-// Remove element from the vector. The vector is non reallocated
+// Remove element from the vector. The vector is not reallocated
 template
 void Vector::RemoveElement(MatrixIndexT i) {
   KALDI_ASSERT(i < this->dim_ && "Access out of vector");
diff --git a/src/matrix/kaldi-vector.h b/src/matrix/kaldi-vector.h
index 383d8ca2862..2a50ae2f2ce 100644
--- a/src/matrix/kaldi-vector.h
+++ b/src/matrix/kaldi-vector.h
@@ -62,8 +62,19 @@ class VectorBase {
   /// Returns the dimension of the vector.
   inline MatrixIndexT Dim() const { return dim_; }
 
-  /// Returns the size in memory of the vector, in bytes.
-  inline MatrixIndexT SizeInBytes() const { return (dim_*sizeof(Real)); }
+  /// Returns the stride between elements of the vector; will normally be 1,
+  /// and must be nonzero.  CAUTION: we are in the process of updating this
+  /// library to support vector strides, so stride != 1 may not be supported
+  /// everywhere, and may sometimes lead to unexpected behavior or crashes.
+  inline MatrixIndexT Stride() const { return stride_; }
+
+  /// Returns the size in memory of the vector, in bytes, assuming stride is 1
+  /// (if not, this doesn't make sense in the contexts in which this is
+  /// called).  TODO: get rid of this.
+  inline MatrixIndexT SizeInBytes() const {
+    KALDI_ASSERT(stride_ == 1);
+    return (dim_*sizeof(Real));
+  }
 
   /// Returns a pointer to the start of the vector's data.
   inline Real* Data() { return data_; }
@@ -75,14 +86,14 @@
   inline Real operator() (MatrixIndexT i) const {
     KALDI_PARANOID_ASSERT(static_cast(i) <
                           static_cast(dim_));
-    return *(data_ + i);
+    return *(data_ + i * stride_);
   }
 
   /// Indexing operator (non-const).
   inline Real & operator() (MatrixIndexT i) {
     KALDI_PARANOID_ASSERT(static_cast(i) <
                           static_cast(dim_));
-    return *(data_ + i);
+    return *(data_ + i * stride_);
   }
 
   /** @brief Returns a sub-vector of a vector (a range of elements).
@@ -360,25 +371,18 @@ class VectorBase {
   ~VectorBase() {}
 
   /// Empty initializer, corresponds to vector of zero size.
-  explicit VectorBase(): data_(NULL), dim_(0) {
+  explicit VectorBase(): data_(NULL), dim_(0), stride_(1) {
     KALDI_ASSERT_IS_FLOATING_TYPE(Real);
   }
 
-// Took this out since it is not currently used, and it is possible to create
-// objects where the allocated memory is not the same size as dim_ : Arnab
-// /// Initializer from a pointer and a size; keeps the pointer internally
-// /// (ownership or non-ownership depends on the child class).
-// explicit VectorBase(Real* data, MatrixIndexT dim)
-//  : data_(data), dim_(dim) {}
-
-  // Arnab : made this protected since it is unsafe too.
-  /// Load data into the vector: sz must match own size.
-  void CopyFromPtr(const Real* Data, MatrixIndexT sz);
 
   /// data memory area
   Real* data_;
 
   /// dimension of vector
   MatrixIndexT dim_;
+  /// stride between elements of the vector.  Would normally be 1.  Must be
+  /// > 0 (if the vector is nonempty).
+  MatrixIndexT stride_;
 
   KALDI_DISALLOW_COPY_AND_ASSIGN(VectorBase);
 }; // class VectorBase
 
@@ -484,17 +488,24 @@ class Vector: public VectorBase {
 template
 class SubVector : public VectorBase {
  public:
-  /// Constructor from a Vector or SubVector.
-  /// SubVectors are not const-safe and it's very hard to make them
-  /// so for now we just give up.  This function contains const_cast.
-  SubVector(const VectorBase &t, const MatrixIndexT origin,
-            const MatrixIndexT length) : VectorBase() {
-    // following assert equiv to origin>=0 && length>=0 &&
-    // origin+length <= rt.dim_
-    KALDI_ASSERT(static_cast(origin)+
-                 static_cast(length) <=
-                 static_cast(t.Dim()));
-    VectorBase::data_ = const_cast (t.Data()+origin);
+  /**
+     Constructor from a Vector or SubVector.
+     SubVectors are not const-safe and it's very hard to make them
+     so for now we just give up.  This function contains const_cast.
+       @param [in] src           The vector we are taking a sub-vector of
+       @param [in] begin         The first element in 'src'
+       @param [in] num_elements  The number of elements we are taking
+       @param [in] step          The step between elements from 'src'; must be >0.
+  */
+  SubVector(const VectorBase &src,
+            const MatrixIndexT begin,
+            const MatrixIndexT num_elements,
+            const MatrixIndexT step = 1) : VectorBase() {
+    KALDI_ASSERT(stride > 0 && static_cast(src)+
+                 static_cast((num_elements - 1) * step)
+                 < static_cast(t.Dim()));
+    VectorBase::data_ = const_cast (t.Data()+src);
     VectorBase::dim_ = length;
   }
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
new file mode 100644
index 00000000000..e94b6978a6b
--- /dev/null
+++ b/src/tensor/tensor.h
@@ -0,0 +1,187 @@
+/**
+   These are some notes on plans for kaldi10 tensor stuff; nothing is fully
+   fleshed out.
+*/
+
+namespace kaldi {
+namespace tensor {
+
+
+enum DeviceType {
+  kCpuDevice = 0,
+  kCudaDevice = 1
+};
+
+// We may later add a device number (like which GPU we are using),
+// once we support multiple GPUs.
+struct Device {
+  DeviceType device_type;
+  // operator ==, probably, maybe constructors.
+};
+
+
+// 'Storage' contains a single allocated region (on CPU or GPU, according
+// to 'device').
+struct Storage {
+  void *data;
+  size_t num_bytes;
+  Device device;
+
+  // Note: will throw if allocation fails (for now).
+  Storage(Device device, size_t num_bytes);
+
+  // Destructor deallocates 'data'.  For now there is no concept of a custom
+  // allocator or an allocator object; we just use our CuDevice stuff for CUDA
+  // allocation and posix_memalign for CPU allocation (obviously we need to
+  // make sure 'data' is aligned in the most specific way we might need); in
+  // future we might choose to add that.
+  ~Storage();
+};
+
+
+enum DataType {
+  kFloatDtype = 0,
+  kDoubleDtype = 1
+};
+
+#define KALDI_TENSOR_MAX_DIM 5
+
+
+
+/*
+  This struct stores the dimensions and strides of a Tensor.  The following
+  describes the properties that a Tensor will always have (note: we also use
+  TensorDim inside implementation code in ways such that these properties do
+  not all hold).
+
+  These properties are stricter than some other frameworks, such as PyTorch,
+  which allow the users to manually add dimensions with stride 0 (and dim>1)
+  so that a lower-dimensional quantity can masquerade as one with a higher
+  dimension.  We require that it never be possible to access the same memory
+  location using two different tuples of indexes.  We also don't allow zero
+  dims (i.e. a tensor must not be empty); if you want an empty Tensor, just
+  use a null pointer.
+
+     0 <= num_axes <= 5
+     for 0 <= axis < num_axes:
+        dims[axis] > 0
+
+  The strides may take any value, including zero or negative, as long as the
+  uniqueness property is satisfied (i.e. it must not be possible to access
+  the same memory location using two different tuples of indices).
+
+*/
+
+struct TensorDim {
+
+  int64_t num_axes;
+  int64_t dims[KALDI_TENSOR_MAX_DIM];
+  int64_t strides[KALDI_TENSOR_MAX_DIM];
+  // We may later add methods to this.
+
+  // Checks that the TensorDim is valid, assuming it is part of a Tensor.
+  // I.e. that it satisfies the properties mentioned above.
+  bool Check();
+};
+
+struct TensorDimProperties {
+  // Below are cached properties that depend on a TensorDim.
+
+  // The number of elements in the Tensor, which equals the product
+  // of dims[0] .. dims[num_axes - 1].  Must always be >0.
+  int64_t num_elements;
+
+  // is_contiguous means that the data form a contiguous block in memory; it
+  // is not the same as PyTorch's is_contiguous, which is a stronger
+  // condition; our has_expected_strides is equivalent to that.
+  bool is_contiguous;
+
+  // has_expected_strides means that the strides are as if this was a
+  // "c"-style multidimensional array, meaning that (using Python wrap-around
+  // indexing conventions as if strides were an array of dimension
+  // 'num_axes'), strides[-1] == 1, strides[-2] == dims[-1],
+  // strides[-3] == dims[-2] * dims[-1], and so on.  This is the same as
+  // PyTorch's is_contiguous.
+  bool has_expected_strides;
+
+  // (One possible implementation is sketched at the end of this patch
+  // series.)
+  void UpdateProperties(const TensorDim &dim);
+};
+
+
+
+class Tensor {
+ public:
+  // ...
+
+ private:
+  // The tensor dim and strides.
+  TensorDim dim_;
+  // Cached properties that depend on dim_.
+  TensorDimProperties derived_;
+  // The data-type of this tensor.
+  DataType dtype_;
+
+  // The raw data pointer
+  void *data_;
+
+  // The storage region where the data resides.  data_ does not necessarily
+  // equal storage_->data; it may be more than that, e.g. if this is a view
+  // to part of another Tensor.
+  std::shared_ptr<Storage> storage_;
+
+
+};
+
+/*
+  This is the 'gradient information' that class Variable stores for a Tensor
+  when it is initialized with requires_grad = true (or is a result of
+  an operation on Variables one of which had requires_grad = true).
+  This does not give you access to the underlying Variables; doing it
+  like this makes reference counting easier (no loops).  The GradFunc
+  will store any pointers to the original Variable that it may have
+  needed.
+
+  Users will rarely need to interact with this struct directly.
+ */
+struct TensorGrad {
+  // The gradients corresponding to the input variables, which
+  // we may need to update.  Some subset of these may be nullptr,
+  // corresponding to input Variables for which no gradient
+  // was required.
+  std::vector<std::shared_ptr<TensorGrad> > inputs;
+
+  // is_view is true if this is the gradient for a Tensor that is a view
+  // into another Tensor.
+  bool is_view{false};
+
+  // The device on which the gradient is to be located.
+  Device device;
+
+  // The dimension of the Tensor for which this is the gradient.  Used
+  // to set up 'grad' when needed.
+  TensorDim dim;
+
+  // 'offset' is only inspected if this is a view; it is the offset
+  // (in elements) from the start of the source Variable's gradient (in that
+  // case 'inputs' will just contain one member, which is the gradient for
+  // the source Variable, and we use 'dim' and 'offset' to construct the
+  // sub-tensor).
+  int64_t offset;
+
+  // This stores the gradient (if we already have one), or nullptr if not.
+ std::unique_ptr grad{nullptr}; + + +}; + + +class Variable { + using GradFunc = std::function< + void(std::vector& inputs, const Variable& grad_output)>; + + +}; + +typedef std::unique_ptr + + + + +}; From 0688a87aef8fa7d6f3b44d004b613f300e8bf359 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 15 Mar 2019 14:20:10 -0400 Subject: [PATCH 2/4] [src] removing matrix column stride (not supported by BLAS); various fixes --- src/matrix/kaldi-matrix.cc | 112 ++++++++++++++++++------------------- src/matrix/kaldi-matrix.h | 71 +++++++++++------------ src/matrix/kaldi-vector.cc | 77 ++++++++++--------------- src/matrix/kaldi-vector.h | 33 ++++++++--- src/matrix/sp-matrix.cc | 7 ++- 5 files changed, 145 insertions(+), 155 deletions(-) diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc index 48481f5f19e..d70ac5cefc8 100644 --- a/src/matrix/kaldi-matrix.cc +++ b/src/matrix/kaldi-matrix.cc @@ -117,10 +117,10 @@ template<> template<> void MatrixBase::AddVecVec(const float alpha, const VectorBase &a, - const VectorBase &rb) { - KALDI_ASSERT(a.Dim() == num_rows_ && rb.Dim() == num_cols_); - cblas_Xger(a.Dim(), rb.Dim(), alpha, a.Data(), 1, rb.Data(), - 1, data_, stride_); + const VectorBase &b) { + KALDI_ASSERT(a.Dim() == num_rows_ && b.Dim() == num_cols_); + cblas_Xger(a.Dim(), b.Dim(), alpha, a.Data(), a.Stride(), + b.Data(), b.Stride(), data_, stride_); } template @@ -132,15 +132,18 @@ void MatrixBase::AddVecVec(const Real alpha, if (num_rows_ * num_cols_ > 100) { // It's probably worth it to allocate // temporary vectors of the right type and use BLAS. Vector temp_a(a), temp_b(b); - cblas_Xger(num_rows_, num_cols_, alpha, temp_a.Data(), 1, - temp_b.Data(), 1, data_, stride_); + cblas_Xger(num_rows_, num_cols_, alpha, + temp_a.Data(), temp_a.Stride(), + temp_b.Data(), temp_b.Stride(), + data_, stride_); } else { const OtherReal *a_data = a.Data(), *b_data = b.Data(); + MatrixIndexT a_stride = a.Stride(), b_stride = b.Stride(); Real *row_data = data_; for (MatrixIndexT i = 0; i < num_rows_; i++, row_data += stride_) { - BaseFloat alpha_ai = alpha * a_data[i]; + BaseFloat alpha_ai = alpha * a_data[i * a_stride]; for (MatrixIndexT j = 0; j < num_cols_; j++) - row_data[j] += alpha_ai * b_data[j]; + row_data[j] += alpha_ai * b_data[j * b_stride]; } } } @@ -159,11 +162,11 @@ template<> template<> void MatrixBase::AddVecVec(const double alpha, const VectorBase &a, - const VectorBase &rb) { - KALDI_ASSERT(a.Dim() == num_rows_ && rb.Dim() == num_cols_); + const VectorBase &b) { + KALDI_ASSERT(a.Dim() == num_rows_ && b.Dim() == num_cols_); if (num_rows_ == 0) return; - cblas_Xger(a.Dim(), rb.Dim(), alpha, a.Data(), 1, rb.Data(), - 1, data_, stride_); + cblas_Xger(a.Dim(), b.Dim(), alpha, a.Data(), a.Stride(), + b.Data(), b.Stride(), data_, stride_); } template @@ -591,8 +594,10 @@ void MatrixBase::AddDiagVecMat( if (transM == kTrans) std::swap(M_row_stride, M_col_stride); Real *data = data_; const Real *Mdata = M.Data(), *vdata = v.Data(); + MatrixIndexT v_stride = v.Stride(); if (num_rows_ == 0) return; - for (MatrixIndexT i = 0; i < num_rows; i++, data += stride, Mdata += M_row_stride, vdata++) + for (MatrixIndexT i = 0; i < num_rows; + i++, data += stride, Mdata += M_row_stride, vdata += v_stride) cblas_Xaxpy(num_cols, alpha * *vdata, Mdata, M_col_stride, data, 1); } @@ -623,10 +628,11 @@ void MatrixBase::AddMatDiagVec( Real *data = data_; const Real *Mdata = M.Data(), *vdata = v.Data(); + MatrixIndexT v_stride = v.Stride(); if (num_rows_ == 0) return; for (MatrixIndexT i = 0; i < 
num_rows; i++){ for(MatrixIndexT j = 0; j < num_cols; j ++ ){ - data[i*stride + j] += alpha * vdata[j] * Mdata[i*M_row_stride + j*M_col_stride]; + data[i*stride + j] += alpha * vdata[j * v_stride] * Mdata[i*M_row_stride + j*M_col_stride]; } } } @@ -658,7 +664,8 @@ void MatrixBase::AddMatMatElements(const Real alpha, template void MatrixBase::LapackGesvd(VectorBase *s, MatrixBase *U_in, MatrixBase *V_in) { - KALDI_ASSERT(s != NULL && U_in != this && V_in != this); + KALDI_ASSERT(s != NULL && U_in != this && V_in != this && + s->Stride() == 1); Matrix tmpU, tmpV; if (U_in == NULL) tmpU.Resize(this->num_rows_, 1); // work-space if U_in empty. @@ -786,13 +793,12 @@ inline void Matrix::Init(const MatrixIndexT rows, KALDI_ASSERT(rows == 0 && cols == 0); this->num_rows_ = 0; this->num_cols_ = 0; - this->row_stride_ = 0; - this->col_stride_ = 0; + this->stride_ = 0; this->data_ = NULL; return; } KALDI_ASSERT(rows > 0 && cols > 0); - MatrixIndexT skip, row_stride; + MatrixIndexT skip, stride; size_t size; void *data; // aligned memory block void *temp; // memory block to be really freed @@ -800,8 +806,8 @@ inline void Matrix::Init(const MatrixIndexT rows, // compute the size of skip and real cols skip = ((16 / sizeof(Real)) - cols % (16 / sizeof(Real))) % (16 / sizeof(Real)); - row_stride = cols + skip; - size = static_cast(rows) * static_cast(row_stride) + stride = cols + skip; + size = static_cast(rows) * static_cast(stride) * sizeof(Real); // allocate the memory and set the right dimensions and parameters @@ -809,8 +815,7 @@ inline void Matrix::Init(const MatrixIndexT rows, MatrixBase::data_ = static_cast (data); MatrixBase::num_rows_ = rows; MatrixBase::num_cols_ = cols; - MatrixBase::row_stride_ = (stride_type == kDefaultStride ? stride : cols); - MatrixBase::col_stride_ = 1; + MatrixBase::stride_ = (stride_type == kDefaultStride ? stride : cols); } else { throw std::bad_alloc(); } @@ -826,7 +831,7 @@ void Matrix::Resize(const MatrixIndexT rows, if (resize_type == kCopyData) { if (this->data_ == NULL || rows == 0) resize_type = kSetZero; // nothing to copy. else if (rows == this->num_rows_ && cols == this->num_cols_ && - (stride_type == kDefaultStride || this->row_stride_ == this->num_cols_)) { return; } // nothing to do. + (stride_type == kDefaultStride || this->stride_ == this->num_cols_)) { return; } // nothing to do. else { // set tmp to a matrix of the desired size; if new matrix // is bigger in some dimension, zero it. 
@@ -876,14 +881,12 @@ void MatrixBase::CopyFromMat(const MatrixBase &M, (*this).Row(i).CopyFromVec(M.Row(i)); } else { KALDI_ASSERT(num_cols_ == M.NumRows() && num_rows_ == M.NumCols()); - int32 this_row_stride = row_stride_, this_col_stride = col_stride_, - other_row_stride = M.RowStride(), other_col_stride = M.ColStride(); + int32 this_stride = stride_, other_stride = M.Stride(); Real *this_data = data_; const OtherReal *other_data = M.Data(); for (MatrixIndexT i = 0; i < num_rows_; i++) for (MatrixIndexT j = 0; j < num_cols_; j++) - this_data[i * this_row_stride + j * this_col_stride] = - other_data[j * other_row_stride + i * other_col_stride]; + this_data[i * this_stride + j] = other_data[j * other_stride + i]; } } @@ -906,17 +909,15 @@ template<> template<> void MatrixBase::CopyFromSp(const SpMatrix & M) { KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); - MatrixIndexT num_rows = num_rows_, - row_stride = row_stride_, - col_stride = col_stride_; + MatrixIndexT num_rows = num_rows_, stride = stride_; const float *Mdata = M.Data(); float *row_data = data_, *col_data = data_; for (MatrixIndexT i = 0; i < num_rows; i++) { - cblas_scopy(i + 1, Mdata, 1, row_data, col_stride); // copy to the row. - cblas_scopy(i, Mdata, 1, col_data, row_stride); // copy to the column. - Mdata += i + 1; - row_data += row_stride; - col_data += col_stride; + cblas_scopy(i+1, Mdata, 1, row_data, 1); // copy to the row. + cblas_scopy(i, Mdata, 1, col_data, stride); // copy to the column. + Mdata += i+1; + row_data += stride; + col_data += 1; } } @@ -925,17 +926,15 @@ template<> template<> void MatrixBase::CopyFromSp(const SpMatrix & M) { KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); - MatrixIndexT num_rows = num_rows_, - row_stride = row_stride_, - col_stride = col_stride_; + MatrixIndexT num_rows = num_rows_, stride = stride_; const double *Mdata = M.Data(); double *row_data = data_, *col_data = data_; for (MatrixIndexT i = 0; i < num_rows; i++) { - cblas_dcopy(i+1, Mdata, 1, row_data, col_stride); // copy to the row. - cblas_dcopy(i, Mdata, 1, col_data, row_stride); // copy to the column. + cblas_dcopy(i+1, Mdata, 1, row_data, 1); // copy to the row. + cblas_dcopy(i, Mdata, 1, col_data, stride); // copy to the column. 
Mdata += i+1; - row_data += row_stride; - col_data += col_stride; + row_data += stride; + col_data += 1; } } @@ -964,26 +963,24 @@ template template void MatrixBase::CopyFromTp(const TpMatrix & M, MatrixTransposeType Trans) { - MatrixIndexT row_stride = row_stride_, col_stride = col_stride_; if (Trans == kNoTrans) { KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); SetZero(); Real *out_i = data_; const OtherReal *in_i = M.Data(); - for (MatrixIndexT i = 0; i < num_rows_; - i++, out_i += row_stride_, in_i += i) { + for (MatrixIndexT i = 0; i < num_rows_; i++, out_i += stride_, in_i += i) { for (MatrixIndexT j = 0; j <= i; j++) - out_i[j * col_stride] = in_i[j]; + out_i[j] = in_i[j]; } } else { SetZero(); KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); + MatrixIndexT stride = stride_; Real *out_i = data_; const OtherReal *in_i = M.Data(); - for (MatrixIndexT i = 0; i < num_rows_; - i++, out_i += col_stride, in_i += i) { + for (MatrixIndexT i = 0; i < num_rows_; i++, out_i ++, in_i += i) { for (MatrixIndexT j = 0; j <= i; j++) - out_i[j * row_stride] = in_i[j]; + out_i[j*stride] = in_i[j]; } } } @@ -1004,7 +1001,7 @@ void MatrixBase::CopyFromTp(const TpMatrix & M, template void MatrixBase::CopyRowsFromVec(const VectorBase &rv) { - if (rv.Dim() == num_rows_ * num_cols_) { + if (rv.Dim() == num_rows_*num_cols_) { if (stride_ == num_cols_) { // one big copy operation. const Real *rv_data = rv.Data(); @@ -1796,7 +1793,7 @@ void MatrixBase::DestructiveSvd(VectorBase *s, MatrixBase *U, // Throws exception on error. KALDI_ASSERT(num_rows_>=num_cols_ && "Svd requires that #rows by >= #cols."); // For compatibility with JAMA code. - KALDI_ASSERT(s->Dim() == num_cols_); // s should be the smaller dim. + KALDI_ASSERT(s->Dim() == num_cols_ && s->Stride() == 1); // s should be the smaller dim. KALDI_ASSERT(U == NULL || (U->num_rows_ == num_rows_&&U->num_cols_ == num_cols_)); KALDI_ASSERT(Vt == NULL || (Vt->num_rows_ == num_cols_&&Vt->num_cols_ == num_cols_)); @@ -2002,27 +1999,28 @@ void MatrixBase::OrthogonalizeRows() { // symmetric positive definite). template -void MatrixBase::SymPosSemiDefEig(VectorBase *rs, MatrixBase *rU, Real check_thresh) // e.g. check_thresh = 0.001 +void MatrixBase::SymPosSemiDefEig(VectorBase *s, MatrixBase *U, Real check_thresh) // e.g. check_thresh = 0.001 { const MatrixIndexT D = num_rows_; KALDI_ASSERT(num_rows_ == num_cols_); KALDI_ASSERT(IsSymmetric() && "SymPosSemiDefEig: expecting input to be symmetrical."); - KALDI_ASSERT(rU->num_rows_ == D && rU->num_cols_ == D && rs->Dim() == D); + KALDI_ASSERT(U->num_rows_ == D && U->num_cols_ == D && s->Dim() == D && + s->Stride() == 1); Matrix Vt(D, D); - Svd(rs, rU, &Vt); + Svd(s, U, &Vt); // First just zero any singular values if the column of U and V do not have +ve dot product-- // this may mean we have small negative eigenvalues, and if we zero them the result will be closer to correct. 
for (MatrixIndexT i = 0;i < D;i++) { Real sum = 0.0; - for (MatrixIndexT j = 0;j < D;j++) sum += (*rU)(j, i) * Vt(i, j); - if (sum < 0.0) (*rs)(i) = 0.0; + for (MatrixIndexT j = 0;j < D;j++) sum += (*U)(j, i) * Vt(i, j); + if (sum < 0.0) (*s)(i) = 0.0; } { - Matrix tmpU(*rU); Vector tmps(*rs); tmps.ApplyPow(0.5); + Matrix tmpU(*U); Vector tmps(*s); tmps.ApplyPow(0.5); tmpU.MulColsVec(tmps); SpMatrix tmpThis(D); tmpThis.AddMat2(1.0, tmpU, kNoTrans, 0.0); diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h index 9148f373c82..4b06a22ece9 100644 --- a/src/matrix/kaldi-matrix.h +++ b/src/matrix/kaldi-matrix.h @@ -59,25 +59,20 @@ class MatrixBase { friend class SparseMatrix; friend class SparseMatrix; - /// Returns number of rows (or zero for emtpy matrix). + /// Returns number of rows (or zero for empty matrix). inline MatrixIndexT NumRows() const { return num_rows_; } /// Returns number of columns (or zero for emtpy matrix). inline MatrixIndexT NumCols() const { return num_cols_; } - /// Stride() is deprecated - inline MatrixIndexT Stride() const { return row_stride_; } - - /// The distance in memory between successive rows. Not required to be - /// positive or even nonzero, as long you can't get to the same - /// memory location using different indexes. - inline MatrixIndexT RowStride() const { return row_stride_; } - - /// The distance in memory between successive columns; will normally - /// be 1 but it may be negative or even zero as long as you - /// can't get to the same memory location using differen indexes. - inline MatrixIndexT ColStride() const { return col_stride_; } + /// Stride (distance in memory between each row). Must be >= NumCols(). + inline MatrixIndexT Stride() const { return stride_; } + /// Returns size in bytes of the data held by the matrix. + size_t SizeInBytes() const { + return static_cast(num_rows_) * static_cast(stride_) * + sizeof(Real); + } /// Gives pointer to raw data (const). inline const Real* Data() const { @@ -87,19 +82,18 @@ class MatrixBase { /// Gives pointer to raw data (non-const). inline Real* Data() { return data_; } - /// Returns pointer to data for one row (non-const). - /// Caution: don't assume ColumnStride() is 1. + /// Returns pointer to data for one row (non-const) inline Real* RowData(MatrixIndexT i) { KALDI_ASSERT(static_cast(i) < static_cast(num_rows_)); - return data_ + i * row_stride_; + return data_ + i * stride_; } /// Returns pointer to data for one row (const) inline const Real* RowData(MatrixIndexT i) const { KALDI_ASSERT(static_cast(i) < static_cast(num_rows_)); - return data_ + i * row_stride_; + return data_ + i * stride_; } /// Indexing operator, non-const @@ -109,7 +103,7 @@ class MatrixBase { static_cast(num_rows_) && static_cast(c) < static_cast(num_cols_)); - return *(data_ + r * row_stride_ + c * col_stride_); + return *(data_ + r * stride_ + c); } /// Indexing operator, provided for ease of debugging (gdb doesn't work /// with parenthesis operator). @@ -122,7 +116,7 @@ class MatrixBase { static_cast(num_rows_) && static_cast(c) < static_cast(num_cols_)); - return *(data_ + r * row_stride_ + c * col_stride_); + return *(data_ + r * stride_ + c); } /* Basic setting-to-special values functions. */ @@ -189,18 +183,20 @@ class MatrixBase { /* Accessing of sub-parts of the matrix. */ - /// Return specific row of matrix [const]. - inline const SubVector Row(MatrixIndexT i) const { + /// Return specific row of matrix. Warning: this can get + /// around const constraints. 
+ inline SubVector Row(MatrixIndexT i) const { KALDI_ASSERT(static_cast(i) < static_cast(num_rows_)); - return SubVector(data_ + (i * stride_), NumCols()); + return SubVector(data_ + (i * stride_), num_cols_); } - /// Return specific row of matrix. - inline SubVector Row(MatrixIndexT i) { + /// Return specific column of matrix. Warning: this can get + /// around const constraints. + inline const SubVector Col(MatrixIndexT i) const { KALDI_ASSERT(static_cast(i) < - static_cast(num_rows_)); - return SubVector(data_ + (i * stride_), NumCols()); + static_cast(num_cols_)); + return SubVector(data_ + i, num_rows_, stride_); } /// Return a sub-part of matrix. @@ -412,7 +408,9 @@ class MatrixBase { Null pointers for U and/or Vt at input mean we do not want that output. We expect that S.Dim() == m, U is either NULL or m by n, and v is either NULL or n by n. - The singular values are not sorted (use SortSvd for that). */ + The singular values are not sorted (use SortSvd for that). + Requires that s->Stride() == 1. + */ void DestructiveSvd(VectorBase *s, MatrixBase *U, MatrixBase *Vt); // Destroys calling matrix. @@ -420,6 +418,7 @@ class MatrixBase { /// transposed; the normal formulation is U diag(s) V^T. /// Null pointers for U or V mean we don't want that output (this saves /// compute). The singular values are not sorted (use SortSvd for that). + /// Requires that s->Stride() == 1. void Svd(VectorBase *s, MatrixBase *U, MatrixBase *Vt) const; /// Compute SVD but only retain the singular values. @@ -537,6 +536,7 @@ class MatrixBase { * positive semi-definite (check_thresh controls how stringent the check is; * set it to 2 to ensure it won't ever complain, but it will zero out negative * dimensions in your matrix. + * Requires s->Stride() == 1. */ void SymPosSemiDefEig(VectorBase *s, MatrixBase *P, Real check_thresh = 0.001); @@ -769,20 +769,13 @@ class MatrixBase { /// data memory area Real* data_; + /// these atributes store the real matrix size as it is stored in memory + /// including memalignment MatrixIndexT num_cols_; /// < Number of columns MatrixIndexT num_rows_; /// < Number of rows - MatrixIndexT row_stride_; ///< Row stride (distance in memory between one - ///< row and the next). Expected to - ///< satisfy abs(row_stride_) >= abs(col_stride_) - ///< (although this won't lead to wrong operation - ///< so we don't check this); - ///< and the matrix must have the property - ///< that no element can be accessed via - ///< two different pairs of indexes. - MatrixIndexT col_stride_; ///< Column stride (distance in memory between - ///< one column and the next). Normally - ///< expected to equal 1. - + /** True number of columns for the internal matrix. This number may differ + * from num_cols_ as memory alignment might be used. 
*/ + MatrixIndexT stride_; private: KALDI_DISALLOW_COPY_AND_ASSIGN(MatrixBase); }; diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc index 4756abda94e..655945bd01b 100644 --- a/src/matrix/kaldi-vector.cc +++ b/src/matrix/kaldi-vector.cc @@ -38,7 +38,7 @@ Real VecVec(const VectorBase &a, const VectorBase &b) { MatrixIndexT adim = a.Dim(); KALDI_ASSERT(adim == b.Dim()); - return cblas_Xdot(adim, a.Data(), 1, b.Data(), 1); + return cblas_Xdot(adim, a.Data(), a.Stride(), b.Data(), b.Stride()); } template @@ -76,7 +76,7 @@ void VectorBase::AddVec(const float alpha, const VectorBase &v) { KALDI_ASSERT(dim_ == v.dim_); KALDI_ASSERT(&v != this); - cblas_Xaxpy(dim_, alpha, v.Data(), 1, data_, 1); + cblas_Xaxpy(dim_, alpha, v.Data(), v.stride_, data_, stride_); } template<> @@ -85,7 +85,7 @@ void VectorBase::AddVec(const double alpha, const VectorBase &v) { KALDI_ASSERT(dim_ == v.dim_); KALDI_ASSERT(&v != this); - cblas_Xaxpy(dim_, alpha, v.Data(), 1, data_, 1); + cblas_Xaxpy(dim_, alpha, v.Data(), v.stride_, data_, stride_); } template @@ -98,7 +98,7 @@ void VectorBase::AddMatVec(const Real alpha, || (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_)); KALDI_ASSERT(&v != this); cblas_Xgemv(trans, M.NumRows(), M.NumCols(), alpha, M.Data(), M.Stride(), - v.Data(), 1, beta, data_, 1); + v.Data(), v.stride_, beta, data_, stride_); } template @@ -111,40 +111,19 @@ void VectorBase::AddMatSvec(const Real alpha, || (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_)); KALDI_ASSERT(&v != this); Xgemv_sparsevec(trans, M.NumRows(), M.NumCols(), alpha, M.Data(), M.Stride(), - v.Data(), 1, beta, data_, 1); + v.Data(), v.stride_, beta, data_, stride_); return; - /* - MatrixIndexT this_dim = this->dim_, v_dim = v.dim_, - M_stride = M.Stride(); - Real *this_data = this->data_; - const Real *M_data = M.Data(), *v_data = v.data_; - if (beta != 1.0) this->Scale(beta); - if (trans == kNoTrans) { - for (MatrixIndexT i = 0; i < v_dim; i++) { - Real v_i = v_data[i]; - if (v_i == 0.0) continue; - // Add to *this, the i'th column of the Matrix, times v_i. - cblas_Xaxpy(this_dim, v_i * alpha, M_data + i, M_stride, this_data, 1); - } - } else { // The transposed case is slightly more efficient, I guess. - for (MatrixIndexT i = 0; i < v_dim; i++) { - Real v_i = v.data_[i]; - if (v_i == 0.0) continue; - // Add to *this, the i'th row of the Matrix, times v_i. 
- cblas_Xaxpy(this_dim, v_i * alpha, - M_data + (i * M_stride), 1, this_data, 1); - } - }*/ } template void VectorBase::AddSpVec(const Real alpha, - const SpMatrix &M, - const VectorBase &v, - const Real beta) { + const SpMatrix &M, + const VectorBase &v, + const Real beta) { KALDI_ASSERT(M.NumRows() == v.dim_ && dim_ == v.dim_); KALDI_ASSERT(&v != this); - cblas_Xspmv(alpha, M.NumRows(), M.Data(), v.Data(), 1, beta, data_, 1); + cblas_Xspmv(alpha, M.NumRows(), M.Data(), v.Data(), v.stride_, + beta, data_, stride_); } @@ -165,7 +144,7 @@ void VectorBase::Solve(const TpMatrix &M, template inline void Vector::Init(const MatrixIndexT dim) { - stride_ = 1; + this->stride_ = 1; KALDI_ASSERT(dim >= 0); if (dim == 0) { this->dim_ = 0; @@ -672,7 +651,7 @@ void VectorBase::CopyColFromMat(const MatrixBase &mat, MatrixInd template void VectorBase::CopyDiagFromMat(const MatrixBase &M) { KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols())); - cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, 1); + cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, stride_); } template @@ -689,7 +668,7 @@ Real VectorBase::Sum() const { // implement sum. This allows us to access SIMD operations in a // cross-platform way via your BLAS library. Real one(1); - return cblas_Xdot(dim_, data_, 1, &one, 0); + return cblas_Xdot(dim_, data_, stride_, &one, 0); } template @@ -712,15 +691,16 @@ Real VectorBase::SumLog() const { template void VectorBase::AddRowSumMat(Real alpha, const MatrixBase &M, Real beta) { KALDI_ASSERT(dim_ == M.NumCols()); - MatrixIndexT num_rows = M.NumRows(), stride = M.Stride(), dim = dim_; + MatrixIndexT num_rows = M.NumRows(), m_stride = M.Stride(), + this_stride = stride_, dim = dim_; Real *data = data_; // implement the function according to a dimension cutoff for computation efficiency if (num_rows <= 64) { - cblas_Xscal(dim, beta, data, 1); + cblas_Xscal(dim, beta, data, this_stride); const Real *m_data = M.Data(); - for (MatrixIndexT i = 0; i < num_rows; i++, m_data += stride) - cblas_Xaxpy(dim, alpha, m_data, 1, data, 1); + for (MatrixIndexT i = 0; i < num_rows; i++, m_data += m_stride) + cblas_Xaxpy(dim, alpha, m_data, 1, data, stride_); } else { Vector ones(M.NumRows()); @@ -773,17 +753,19 @@ Real VectorBase::LogSumExp(Real prune) const { template void VectorBase::InvertElements() { - for (MatrixIndexT i = 0; i < dim_; i++) { - data_[i] = static_cast(1 / data_[i]); + MatrixIndexT dim = dim_, stride = stride_; + for (MatrixIndexT i = 0; i < dim; i++) { + data_[i * stride] = static_cast(1) / data_[i * stride]; } } template void VectorBase::ApplyLog() { - for (MatrixIndexT i = 0; i < dim_; i++) { - if (data_[i] < 0.0) + MatrixIndexT dim = dim_, stride = stride_; + for (MatrixIndexT i = 0; i < dim; i++) { + if (data_[i * stride] < 0.0) KALDI_ERR << "Trying to take log of a negative number."; - data_[i] = Log(data_[i]); + data_[i * stride] = Log(data_[i * stride]); } } @@ -954,7 +936,7 @@ void VectorBase::Add(Real c) { template void VectorBase::Scale(Real alpha) { - cblas_Xscal(dim_, alpha, data_, 1); + cblas_Xscal(dim_, alpha, data_, stride_); } template @@ -995,8 +977,8 @@ void VectorBase::AddVecVec(Real alpha, const VectorBase &v, KALDI_ASSERT(v.data_ != this->data_ && r.data_ != this->data_); // We pretend that v is a band-diagonal matrix. 
   KALDI_ASSERT(dim_ == v.dim_ && dim_ == r.dim_);
-  cblas_Xgbmv(kNoTrans, dim_, dim_, 0, 0, alpha, v.data_, 1,
-              r.data_, 1, beta, this->data_, 1);
+  cblas_Xgbmv(kNoTrans, dim_, dim_, 0, 0, alpha, v.data_, v.stride_,
+              r.data_, r.stride_, beta, this->data_, stride_);
 }
 
@@ -1304,7 +1286,8 @@ void VectorBase::AddDiagMat2(
     Real *data = this->data_;
     const Real *mat_data = M.Data();
     for (MatrixIndexT i = 0; i < rows; i++, mat_data += mat_stride, data++)
-      *data = beta * *data + alpha * cblas_Xdot(cols,mat_data,1,mat_data,1);
+      *data = beta * *data + alpha * cblas_Xdot(cols, mat_data, 1,
+                                                mat_data, 1);
   } else {
     KALDI_ASSERT(this->dim_ == M.NumCols());
     MatrixIndexT rows = M.NumRows(), cols = this->dim_,
diff --git a/src/matrix/kaldi-vector.h b/src/matrix/kaldi-vector.h
index 2a50ae2f2ce..2a10b129ef5 100644
--- a/src/matrix/kaldi-vector.h
+++ b/src/matrix/kaldi-vector.h
@@ -502,11 +502,18 @@ class SubVector : public VectorBase {
             const MatrixIndexT begin,
             const MatrixIndexT num_elements,
             const MatrixIndexT step = 1) : VectorBase() {
-    KALDI_ASSERT(stride > 0 && static_cast(src)+
-                 static_cast((num_elements - 1) * step)
-                 < static_cast(t.Dim()));
-    VectorBase::data_ = const_cast (t.Data()+src);
-    VectorBase::dim_ = length;
+    // Casting to UnsignedMatrixIndexT is a mechanism to test something
+    // is >= 0 as well as < x (for positive x) in a single comparison.
+    typedef UnsignedMatrixIndexT U;
+    KALDI_ASSERT(
+        step != 0 &&
+        static_cast(begin) < static_cast(src.Dim()) &&
+        static_cast(begin + step * (num_elements - 1)) <
+        static_cast(src.Dim()));
+    VectorBase::data_ = const_cast (src.Data() +
+                                    begin * src.Stride());
+    VectorBase::dim_ = num_elements;
+    VectorBase::stride_ = step * src.Stride();
   }
 
   /// This constructor initializes the vector to point at the contents
   /// of the given packed matrix (SpMatrix or TpMatrix).
+  SubVector(const PackedMatrix &M) {
     VectorBase::data_ = const_cast (M.Data());
     VectorBase::dim_ = (M.NumRows()*(M.NumRows()+1))/2;
+    VectorBase::stride_ = 1;
   }
 
   /// Copy constructor
   SubVector(const SubVector &other) : VectorBase () {
     // this copy constructor needed for Range() to work in base class.
     VectorBase::data_ = other.data_;
     VectorBase::dim_ = other.dim_;
+    VectorBase::stride_ = other.stride_;
   }
 
-  /// Constructor from a pointer to memory and a length.  Keeps a pointer
-  /// to the data but does not take ownership (will never delete).
-  /// Caution: this constructor enables you to evade const constraints.
-  SubVector(const Real *data, MatrixIndexT length) : VectorBase () {
+  /// Constructor from a pointer to memory and a length, and an optional
+  /// stride.  Keeps a pointer to the data but does not take ownership (will
+  /// never delete).  Caution: this constructor enables you to evade const
+  /// constraints.
+  SubVector(const Real *data, MatrixIndexT length, MatrixIndexT stride = 1):
+      VectorBase () {
     VectorBase::data_ = const_cast(data);
     VectorBase::dim_ = length;
+    VectorBase::stride_ = stride;
   }
 
   /// This operation does not preserve const-ness, so be careful.
+  /// This function is somewhat deprecated for being ambiguous;
+  /// MatrixBase::Row() is probably preferred.
   SubVector(const MatrixBase &matrix, MatrixIndexT row) {
     VectorBase::data_ = const_cast(matrix.RowData(row));
     VectorBase::dim_ = matrix.NumCols();
+    VectorBase::stride_ = 1;
   }
 
   ~SubVector() {}  ///< Destructor (does nothing; no pointers are owned here).
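NOTE: to make the strided-vector API above concrete, here is a hypothetical
usage sketch (not part of the patch; the variable names are illustrative).
It relies only on operations this series makes stride-aware -- the strided
SubVector constructor, Scale(), VecVec() and MatrixBase::Col() -- and the
series itself cautions that stride != 1 is not yet supported everywhere:

  Vector<float> v(10);                 // owns contiguous storage; stride 1
  SubVector<float> evens(v, 0, 5, 2);  // view of elements 0, 2, 4, 6, 8 of v
  evens.Scale(2.0);                    // cblas_Xscal is invoked with stride 2
  Matrix<float> M(5, 6);
  SubVector<float> col2(M.Col(2));     // column view: dim 5, stride == M.Stride()
  float dot = VecVec(evens, col2);     // cblas_Xdot with strides 2 and M.Stride()

Nothing is copied here: each SubVector aliases the original storage, which is
why these views cannot be made const-safe.
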
diff --git a/src/matrix/sp-matrix.cc b/src/matrix/sp-matrix.cc index 224ef39fb6e..40511f537ef 100644 --- a/src/matrix/sp-matrix.cc +++ b/src/matrix/sp-matrix.cc @@ -180,16 +180,17 @@ Real SpMatrix::Trace() const { // diagonal update, this <-- this + diag(v) template template -void SpMatrix::AddDiagVec(const Real alpha, const VectorBase &v) { +void SpMatrix::AddDiagVec(const Real alpha, const VectorBase &v) { int32 num_rows = this->num_rows_; KALDI_ASSERT(num_rows == v.Dim() && num_rows > 0); const OtherReal *src = v.Data(); Real *dst = this->data_; + MatrixIndexT src_stride = v.Stride(); if (alpha == 1.0) - for (int32 i = 1; i <= num_rows; i++, src++, dst += i) + for (int32 i = 1; i <= num_rows; i++, src += src_stride, dst += i) *dst += *src; else - for (int32 i = 1; i <= num_rows; i++, src++, dst += i) + for (int32 i = 1; i <= num_rows; i++, src += src_stride, dst += i) *dst += alpha * *src; } From dda4479af692dd84175494294717edff95683976 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 15 Mar 2019 14:20:40 -0400 Subject: [PATCH 3/4] [src] some updates to kaldi-vector.cc RE column stride, etc. --- src/matrix/kaldi-vector.cc | 63 +++++++++++--------------------------- 1 file changed, 18 insertions(+), 45 deletions(-) diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc index 4756abda94e..fd6b76459ee 100644 --- a/src/matrix/kaldi-vector.cc +++ b/src/matrix/kaldi-vector.cc @@ -55,9 +55,10 @@ Real VecVec(const VectorBase &ra, KALDI_ASSERT(adim == rb.Dim()); const Real *a_data = ra.Data(); const OtherReal *b_data = rb.Data(); + MatrixIndexT a_stride = a.Stride(), b_stride = b.Stride(); Real sum = 0.0; for (MatrixIndexT i = 0; i < adim; i++) - sum += a_data[i]*b_data[i]; + sum += a_data[i * a_stride] * b_data[i * b_stride]; return sum; } @@ -111,30 +112,8 @@ void VectorBase::AddMatSvec(const Real alpha, || (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_)); KALDI_ASSERT(&v != this); Xgemv_sparsevec(trans, M.NumRows(), M.NumCols(), alpha, M.Data(), M.Stride(), - v.Data(), 1, beta, data_, 1); + v.Data(), v.Stride(), beta, data_, stride_); return; - /* - MatrixIndexT this_dim = this->dim_, v_dim = v.dim_, - M_stride = M.Stride(); - Real *this_data = this->data_; - const Real *M_data = M.Data(), *v_data = v.data_; - if (beta != 1.0) this->Scale(beta); - if (trans == kNoTrans) { - for (MatrixIndexT i = 0; i < v_dim; i++) { - Real v_i = v_data[i]; - if (v_i == 0.0) continue; - // Add to *this, the i'th column of the Matrix, times v_i. - cblas_Xaxpy(this_dim, v_i * alpha, M_data + i, M_stride, this_data, 1); - } - } else { // The transposed case is slightly more efficient, I guess. - for (MatrixIndexT i = 0; i < v_dim; i++) { - Real v_i = v.data_[i]; - if (v_i == 0.0) continue; - // Add to *this, the i'th row of the Matrix, times v_i. 
- cblas_Xaxpy(this_dim, v_i * alpha, - M_data + (i * M_stride), 1, this_data, 1); - } - }*/ } template @@ -144,7 +123,8 @@ void VectorBase::AddSpVec(const Real alpha, const Real beta) { KALDI_ASSERT(M.NumRows() == v.dim_ && dim_ == v.dim_); KALDI_ASSERT(&v != this); - cblas_Xspmv(alpha, M.NumRows(), M.Data(), v.Data(), 1, beta, data_, 1); + cblas_Xspmv(alpha, M.NumRows(), M.Data(), v.Data(), v.Stride(), beta, + data_, stride_); } @@ -672,14 +652,15 @@ void VectorBase::CopyColFromMat(const MatrixBase &mat, MatrixInd template void VectorBase::CopyDiagFromMat(const MatrixBase &M) { KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols())); - cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, 1); + cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, stride_); } template void VectorBase::CopyDiagFromPacked(const PackedMatrix &M) { KALDI_ASSERT(dim_ == M.NumCols()); - for (MatrixIndexT i = 0; i < dim_; i++) - data_[i] = M(i, i); + MatrixIndexT stride = stride_, dim = dim_; + for (MatrixIndexT i = 0; i < dim; i++) + data_[i * stride] = M(i, i); // could make this more efficient. } @@ -689,15 +670,16 @@ Real VectorBase::Sum() const { // implement sum. This allows us to access SIMD operations in a // cross-platform way via your BLAS library. Real one(1); - return cblas_Xdot(dim_, data_, 1, &one, 0); + return cblas_Xdot(dim_, data_, stride_, &one, 0); } template Real VectorBase::SumLog() const { double sum_log = 0.0; double prod = 1.0; + MatrixIndexT dim = dim_, stride = stride_; for (MatrixIndexT i = 0; i < dim_; i++) { - prod *= data_[i]; + prod *= data_[i * stride]; // Possible future work (arnab): change these magic values to pre-defined // constants if (prod < 1.0e-10 || prod > 1.0e+10) { @@ -710,23 +692,14 @@ Real VectorBase::SumLog() const { } template -void VectorBase::AddRowSumMat(Real alpha, const MatrixBase &M, Real beta) { +void VectorBase::AddRowSumMat(Real alpha, const MatrixBase &M, + Real beta) { KALDI_ASSERT(dim_ == M.NumCols()); - MatrixIndexT num_rows = M.NumRows(), stride = M.Stride(), dim = dim_; - Real *data = data_; - // implement the function according to a dimension cutoff for computation efficiency - if (num_rows <= 64) { - cblas_Xscal(dim, beta, data, 1); - const Real *m_data = M.Data(); - for (MatrixIndexT i = 0; i < num_rows; i++, m_data += stride) - cblas_Xaxpy(dim, alpha, m_data, 1, data, 1); - - } else { - Vector ones(M.NumRows()); - ones.Set(1.0); - this->AddMatVec(alpha, M, kTrans, ones, beta); - } + // treat 'one' as a vector with stride zero. + Real one(1); + cblas_Xgemv(kTrans, M.NumRows(), M.NumCols(), alpha, M.Data(), + M.Stride(), &one, 0, data_, stride_, beta, data_, stride_); } template From d32f22e4cc934f3c3de9abd005325bcb362f1180 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 15 Mar 2019 15:50:24 -0400 Subject: [PATCH 4/4] [egs] Revert unwanted change to cmd.sh --- egs/mini_librispeech/s5/cmd.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/egs/mini_librispeech/s5/cmd.sh b/egs/mini_librispeech/s5/cmd.sh index 223eb21c55d..71dd849a93b 100644 --- a/egs/mini_librispeech/s5/cmd.sh +++ b/egs/mini_librispeech/s5/cmd.sh @@ -10,6 +10,6 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
-export train_cmd="retry.pl --num-tries 3 queue.pl --mem 2G" -export decode_cmd="retry.pl --num-tries 3 queue.pl --mem 4G" -export mkgraph_cmd="retry.pl --num-tries 3 queue.pl --mem 8G" +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G"