<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">From d94cd0d3eec33e4290d7ca81918f5ac61444886e Mon Sep 17 00:00:00 2001
From: Dumitru Ceara &lt;dceara@redhat.com&gt;
Date: Tue, 26 Apr 2022 12:37:08 +0200
Subject: [PATCH] ovsdb-idl: Support write-only-changed IDL monitor mode.

At a first glance, change tracking should never be allowed for
write-only columns.  However, some clients (e.g., ovn-northd) that are
mostly exclusive writers of a database, use change tracking to avoid
duplicating the IDL row records into a local cache when implementing
incremental processing.

The default behavior of the IDL is to automatically turn a write-only
column into a read-write column whenever the client enables change
tracking for that column.

For the afore mentioned clients, this becomes a performance issue.
Commit 1cc618c32524 ("ovsdb-idl: Fix atomicity of writes that don't
change a column's value.") explains why writes that don't change a
column's value cannot be optimized out early if the column is
read/write.

Furthermore, if there is at least one record in any table that
changed during a transaction, then *all* records that have been
written are added to the transaction, even if their values didn't
change.  If there are many such rows (e.g., like in ovn-northd's
case) this incurs a significant overhead because:
a. the client has to build this large transaction
b. the transaction has to be sent over the network
c. the server needs to parse this (mostly) no-op update

We now introduce new IDL APIs allowing users to set a new monitoring
mode flag, OVSDB_IDL_WRITE_CHANGED_ONLY, to indicate to the IDL that the
atomicity constraints may be relaxed and written columns that don't
change value can be skipped from the current transaction.

We benchmarked ovn-northd performance when using this new mode
against NB and SB databases taken from ovn-kubernetes scale tests.
We noticed that when a minor change is performed to the Northbound
database (e.g., NB_Global.nb_cfg is incremented) the time it takes to
build the Southbound transaction becomes negligible (vs ~1.5 seconds
before this change).

End-to-end ovn-kubernetes scale tests on 120-node clusters also show
significant reduction of latency to bring up pods; both average and P99
latency decreased by ~30%.

Acked-by: Han Zhou &lt;hzhou@ovn.org&gt;
Signed-off-by: Dumitru Ceara &lt;dceara@redhat.com&gt;
Signed-off-by: Ilya Maximets &lt;i.maximets@ovn.org&gt;
---
 lib/ovsdb-idl.c    | 60 +++++++++++++++++++++++++++++++++++++++++-----
 lib/ovsdb-idl.h    | 16 +++++++++++--
 tests/ovsdb-idl.at | 31 +++++++++++++++++++++++-
 tests/test-ovsdb.c | 18 ++++++++++----
 5 files changed, 115 insertions(+), 14 deletions(-)

--- a/lib/ovsdb-idl.c
+++ b/lib/ovsdb-idl.c
@@ -1398,6 +1398,45 @@ ovsdb_idl_track_clear(struct ovsdb_idl *
 {
     ovsdb_idl_track_clear__(idl, false);
 }
+
+/* Sets or clears (depending on 'enable') OVSDB_IDL_WRITE_CHANGED_ONLY
+ * for 'column' in 'idl', ensuring that the column will be included in a
+ * transaction only if its value has actually changed locally.  Normally
+ * read/write columns that are written to are always included in the
+ * transaction but, in specific cases, when the application doesn't
+ * require atomicity of writes across different columns, the ones that
+ * don't change value may be skipped.
+ *
+ * This function should be called between ovsdb_idl_create() and
+ * the first call to ovsdb_idl_run().
+ */
+void
+ovsdb_idl_set_write_changed_only(struct ovsdb_idl *idl,
+                                 const struct ovsdb_idl_column *column,
+                                 bool enable)
+{
+    if (enable) {
+        *ovsdb_idl_get_mode(idl, column) |= OVSDB_IDL_WRITE_CHANGED_ONLY;
+    } else {
+        *ovsdb_idl_get_mode(idl, column) &amp;= ~OVSDB_IDL_WRITE_CHANGED_ONLY;
+    }
+}
+
+/* Helper function to wrap calling ovsdb_idl_set_write_changed_only() for
+ * all columns that are part of 'idl'.
+ */
+void
+ovsdb_idl_set_write_changed_only_all(struct ovsdb_idl *idl, bool enable)
+{
+    for (size_t i = 0; i &lt; idl-&gt;class_-&gt;n_tables; i++) {
+        const struct ovsdb_idl_table_class *tc = &amp;idl-&gt;class_-&gt;tables[i];
+
+        for (size_t j = 0; j &lt; tc-&gt;n_columns; j++) {
+            const struct ovsdb_idl_column *column = &amp;tc-&gt;columns[j];
+            ovsdb_idl_set_write_changed_only(idl, column, enable);
+        }
+    }
+}
 
 static void
 log_parse_update_error(struct ovsdb_error *error)
@@ -3502,6 +3541,8 @@ ovsdb_idl_txn_write__(const struct ovsdb
 {
     struct ovsdb_idl_row *row = CONST_CAST(struct ovsdb_idl_row *, row_);
     const struct ovsdb_idl_table_class *class;
+    unsigned char column_mode;
+    bool optimize_rewritten;
     size_t column_idx;
     bool write_only;
 
@@ -3512,12 +3553,15 @@ ovsdb_idl_txn_write__(const struct ovsdb
 
     class = row-&gt;table-&gt;class_;
     column_idx = column - class-&gt;columns;
-    write_only = row-&gt;table-&gt;modes[column_idx] == OVSDB_IDL_MONITOR;
+    column_mode = row-&gt;table-&gt;modes[column_idx];
+    write_only = column_mode == OVSDB_IDL_MONITOR;
+    optimize_rewritten =
+        write_only || (column_mode &amp; OVSDB_IDL_WRITE_CHANGED_ONLY);
+
 
     ovs_assert(row-&gt;new_datum != NULL);
     ovs_assert(column_idx &lt; class-&gt;n_columns);
-    ovs_assert(row-&gt;old_datum == NULL ||
-               row-&gt;table-&gt;modes[column_idx] &amp; OVSDB_IDL_MONITOR);
+    ovs_assert(row-&gt;old_datum == NULL || column_mode &amp; OVSDB_IDL_MONITOR);
 
     if (row-&gt;table-&gt;idl-&gt;verify_write_only &amp;&amp; !write_only) {
         VLOG_ERR("Bug: Attempt to write to a read/write column (%s:%s) when"
@@ -3535,9 +3579,13 @@ ovsdb_idl_txn_write__(const struct ovsdb
      * different value in that column since we read it.  (But if a whole
      * transaction only does writes of existing values, without making any real
      * changes, we will drop the whole transaction later in
-     * ovsdb_idl_txn_commit().) */
-    if (write_only &amp;&amp; ovsdb_datum_equals(ovsdb_idl_read(row, column),
-                                         datum, &amp;column-&gt;type)) {
+     * ovsdb_idl_txn_commit().)
+     *
+     * The application may choose to bypass this restriction and always
+     * optimize by setting OVSDB_IDL_WRITE_CHANGED_ONLY.
+     */
+    if (optimize_rewritten &amp;&amp; ovsdb_datum_equals(ovsdb_idl_read(row, column),
+                                                 datum, &amp;column-&gt;type)) {
         goto discard_datum;
     }
 
--- a/lib/ovsdb-idl.h
+++ b/lib/ovsdb-idl.h
@@ -158,7 +158,7 @@ bool ovsdb_idl_server_has_column(const s
  * IDL will change the value returned by ovsdb_idl_get_seqno() when the
  * column's value changes in any row.
  *
- * The possible mode combinations are:
+ * Typical mode combinations are:
  *
  *   - 0, for a column that a client doesn't care about.  This is the default
  *     for every column in every table, if the client passes false for
@@ -181,10 +181,17 @@ bool ovsdb_idl_server_has_column(const s
  *   - (OVSDB_IDL_MONITOR | OVSDB_IDL_ALERT | OVSDB_IDL_TRACK), for a column
  *     that a client wants to track using the change tracking
  *     ovsdb_idl_track_get_*() functions.
+ *
+ *   - (OVSDB_IDL_MONITOR | OVSDB_IDL_ALERT | OVSDB_IDL_WRITE_CHANGED_ONLY)
+ *     is similar to (OVSDB_IDL_MONITOR | OVSDB_IDL_ALERT) except that it
+ *     only adds a written column to a transaction if the column's value
+ *     has actually changed.
  */
 #define OVSDB_IDL_MONITOR (1 &lt;&lt; 0) /* Replicate this column? */
 #define OVSDB_IDL_ALERT   (1 &lt;&lt; 1) /* Alert client when column changes? */
-#define OVSDB_IDL_TRACK   (1 &lt;&lt; 2)
+#define OVSDB_IDL_TRACK   (1 &lt;&lt; 2) /* Track column changes? */
+#define OVSDB_IDL_WRITE_CHANGED_ONLY \
+                          (1 &lt;&lt; 3) /* Write back only changed columns? */
 
 void ovsdb_idl_add_column(struct ovsdb_idl *, const struct ovsdb_idl_column *);
 void ovsdb_idl_add_table(struct ovsdb_idl *,
@@ -233,6 +240,11 @@ bool ovsdb_idl_track_is_updated(const st
                                 const struct ovsdb_idl_column *column);
 void ovsdb_idl_track_clear(struct ovsdb_idl *);
 
+void ovsdb_idl_set_write_changed_only(struct ovsdb_idl *idl,
+                                      const struct ovsdb_idl_column *column,
+                                      bool enable);
+void ovsdb_idl_set_write_changed_only_all(struct ovsdb_idl *idl, bool enable);
+
 
 /* Reading the database replica. */
 
--- a/tests/ovsdb-idl.at
+++ b/tests/ovsdb-idl.at
@@ -94,6 +94,20 @@ m4_define([OVSDB_CHECK_IDL_C],
    OVSDB_SERVER_SHUTDOWN
    AT_CLEANUP])
 
+# same as OVSDB_CHECK_IDL but uses OVSDB_IDL_WRITE_CHANGED_ONLY.
+m4_define([OVSDB_CHECK_IDL_WRITE_CHANGED_ONLY_C],
+  [AT_SETUP([$1 - write-changed-only - C])
+   AT_KEYWORDS([ovsdb server idl positive $5])
+   AT_CHECK([ovsdb_start_idltest])
+   m4_if([$2], [], [],
+     [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])])
+   AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 idl unix:socket $3],
+            [0], [stdout], [ignore])
+   AT_CHECK([sort stdout | uuidfilt]m4_if([$6],,, [[| $6]]),
+            [0], [$4])
+   OVSDB_SERVER_SHUTDOWN
+   AT_CLEANUP])
+
 # same as OVSDB_CHECK_IDL but uses tcp.
 m4_define([OVSDB_CHECK_IDL_TCP_C],
   [AT_SETUP([$1 - C - tcp])
@@ -264,6 +278,7 @@ m4_define([OVSDB_CHECK_IDL_SSL_PY],
 
 m4_define([OVSDB_CHECK_IDL],
   [OVSDB_CHECK_IDL_C($@)
+  OVSDB_CHECK_IDL_WRITE_CHANGED_ONLY_C($@)
    OVSDB_CHECK_IDL_TCP_C($@)
    OVSDB_CHECK_IDL_TCP6_C($@)
    OVSDB_CHECK_IDL_PY($@)
@@ -1202,8 +1217,22 @@ m4_define([OVSDB_CHECK_IDL_TRACK_C],
    OVSDB_SERVER_SHUTDOWN
    AT_CLEANUP])
 
+m4_define([OVSDB_CHECK_IDL_TRACK_WRITE_CHANGED_ONLY_C],
+  [AT_SETUP([$1 - write-changed-only - C])
+   AT_KEYWORDS([ovsdb server idl tracking positive $5])
+   AT_CHECK([ovsdb_start_idltest])
+   m4_if([$2], [], [],
+     [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])])
+   AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 -c -w idl unix:socket $3],
+            [0], [stdout], [ignore])
+   AT_CHECK([sort stdout | uuidfilt]m4_if([$6],,, [[| $6]]),
+            [0], [$4])
+   OVSDB_SERVER_SHUTDOWN
+   AT_CLEANUP])
+
 m4_define([OVSDB_CHECK_IDL_TRACK],
-  [OVSDB_CHECK_IDL_TRACK_C($@)])
+  [OVSDB_CHECK_IDL_TRACK_C($@)
+   OVSDB_CHECK_IDL_TRACK_WRITE_CHANGED_ONLY_C($@)])
 
 OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated],
   [['["idltest",
--- a/tests/test-ovsdb.c
+++ b/tests/test-ovsdb.c
@@ -56,6 +56,7 @@
 VLOG_DEFINE_THIS_MODULE(test_ovsdb);
 
 struct test_ovsdb_pvt_context {
+    bool write_changed_only;
     bool track;
 };
 
@@ -91,6 +92,7 @@ parse_options(int argc, char *argv[], st
         {"timeout", required_argument, NULL, 't'},
         {"verbose", optional_argument, NULL, 'v'},
         {"change-track", optional_argument, NULL, 'c'},
+        {"write-changed-only", optional_argument, NULL, 'w'},
         {"magic", required_argument, NULL, OPT_MAGIC},
         {"no-rename-open-files", no_argument, NULL, OPT_NO_RENAME_OPEN_FILES},
         {"help", no_argument, NULL, 'h'},
@@ -125,6 +127,10 @@ parse_options(int argc, char *argv[], st
             pvt-&gt;track = true;
             break;
 
+        case 'w':
+            pvt-&gt;write_changed_only = true;
+            break;
+
         case OPT_MAGIC:
             magic = optarg;
             break;
@@ -2681,6 +2687,7 @@ update_conditions(struct ovsdb_idl *idl,
 static void
 do_idl(struct ovs_cmdl_context *ctx)
 {
+    struct test_ovsdb_pvt_context *pvt = ctx-&gt;pvt;
     struct jsonrpc *rpc;
     struct ovsdb_idl *idl;
     unsigned int next_cond_seqno = 0;
@@ -2690,9 +2697,6 @@ do_idl(struct ovs_cmdl_context *ctx)
     int step = 0;
     int error;
     int i;
-    bool track;
-
-    track = ((struct test_ovsdb_pvt_context *)(ctx-&gt;pvt))-&gt;track;
 
     idl = ovsdb_idl_create(ctx-&gt;argv[1], &amp;idltest_idl_class, true, true);
     ovsdb_idl_set_leader_only(idl, false);
@@ -2709,10 +2713,14 @@ do_idl(struct ovs_cmdl_context *ctx)
         rpc = NULL;
     }
 
-    if (track) {
+    if (pvt-&gt;track) {
         ovsdb_idl_track_add_all(idl);
     }
 
+    if (pvt-&gt;write_changed_only) {
+        ovsdb_idl_set_write_changed_only_all(idl, true);
+    }
+
     setvbuf(stdout, NULL, _IONBF, 0);
 
     symtab = ovsdb_symbol_table_create();
@@ -2770,7 +2778,7 @@ do_idl(struct ovs_cmdl_context *ctx)
             }
 
             /* Print update. */
-            if (track) {
+            if (pvt-&gt;track) {
                 print_idl_track(idl, step++, terse);
                 ovsdb_idl_track_clear(idl);
             } else {
</pre></body></html>