Fix CSV multiple edge file import

Summary: The CSV importer used to generate non-unique edge IDs when generating edges from multiple CSV files. This is incompatible with the unique ID requirements introduced by the WAL. Tested and fixed in this diff. Reviewers: teon.banek, mferencevic Reviewed By: teon.banek Subscribers: pullbot Differential Revision: https://phabricator.memgraph.io/D980
2017-11-14 14:06:19 +01:00 · 2017-11-14 14:06:19 +01:00 · b7188c296c
commit b7188c296c
parent 885d19401f
5 changed files with 18 additions and 17 deletions
--- a/tools/src/mg_import_csv/main.cpp
+++ b/tools/src/mg_import_csv/main.cpp
@ -300,10 +300,10 @@ void WriteRelationshipsRow(
  encoder.WriteMap(properties);
 }

-auto ConvertRelationships(
+void ConvertRelationships(
    const std::string &relationships_path, const MemgraphNodeIdMap &node_id_map,
-    communication::bolt::BaseEncoder<HashedFileWriter> &encoder) {
-  int64_t relationship_count = 0;
+    communication::bolt::BaseEncoder<HashedFileWriter> &encoder,
+    int64_t &next_relationship_id) {
  std::ifstream relationships_file(relationships_path);
  CHECK(relationships_file)
      << fmt::format("Unable to open '{}'", relationships_path);
@ -312,13 +312,10 @@ auto ConvertRelationships(
  while (!row.empty()) {
    CHECK_EQ(row.size(), fields.size())
        << "Expected as many values as there are header fields";
-    auto relationship_id = relationship_count;
-    WriteRelationshipsRow(fields, row, node_id_map, relationship_id, encoder);
-    // Increase count and move to next row.
-    relationship_count += 1;
+    WriteRelationshipsRow(fields, row, node_id_map, next_relationship_id++,
+                          encoder);
    row = ReadRow(relationships_file);
  }
-  return relationship_count;
 }

 void Convert(const std::vector<std::string> &nodes,
@ -328,7 +325,7 @@ void Convert(const std::vector<std::string> &nodes,
    HashedFileWriter buffer(output_path);
    communication::bolt::BaseEncoder<HashedFileWriter> encoder(buffer);
    int64_t node_count = 0;
-    int64_t relationship_count = 0;
+    int64_t next_relationship_id = 0;
    MemgraphNodeIdMap node_id_map;
    // Snapshot file has the following contents in order:
    //   1) Magic number.
@ -349,11 +346,11 @@ void Convert(const std::vector<std::string> &nodes,
      node_count += ConvertNodes(nodes_file, node_id_map, encoder);
    }
    for (const auto &relationships_file : relationships) {
-      relationship_count +=
-          ConvertRelationships(relationships_file, node_id_map, encoder);
+      ConvertRelationships(relationships_file, node_id_map, encoder,
+                           next_relationship_id);
    }
    buffer.WriteValue(node_count);
-    buffer.WriteValue(relationship_count);
+    buffer.WriteValue(next_relationship_id);
    buffer.WriteValue(buffer.hash());
  } catch (const std::ios_base::failure &) {
    // Only HashedFileWriter sets the underlying fstream to throw.
--- a/tools/tests/CMakeLists.txt
+++ b/tools/tests/CMakeLists.txt
@ -25,7 +25,8 @@ target_link_libraries(mg_recovery_check gtest gtest_main)
 # Copy CSV data to CMake build dir
 configure_file(csv/comment_nodes.csv csv/comment_nodes.csv COPYONLY)
 configure_file(csv/forum_nodes.csv csv/forum_nodes.csv COPYONLY)
-configure_file(csv/relationships.csv csv/relationships.csv COPYONLY)
+configure_file(csv/relationships_0.csv csv/relationships_0.csv COPYONLY)
+configure_file(csv/relationships_1.csv csv/relationships_1.csv COPYONLY)
 # Copy the actual runner to CMake build dir
 configure_file(test_mg_import_csv test_mg_import_csv COPYONLY)

--- a/tools/tests/csv/relationships_0.csv
+++ b/tools/tests/csv/relationships_0.csv
@ -2,5 +2,3 @@
 0|0|POSTED_ON
 1|1|POSTED_ON
 2|2|POSTED_ON
-3|3|POSTED_ON
-4|4|POSTED_ON
--- a/tools/tests/csv/relationships_1.csv
+++ b/tools/tests/csv/relationships_1.csv
@ -0,0 +1,3 @@
+:START_ID(COMMENT_ID)|:END_ID(FORUM_ID)|:TYPE
+3|3|POSTED_ON
+4|4|POSTED_ON
--- a/tools/tests/test_mg_import_csv
+++ b/tools/tests/test_mg_import_csv
@ -23,11 +23,13 @@ def main():
    args = parse_args()
    comment_nodes = os.path.join(_SCRIPT_DIR, 'csv', 'comment_nodes.csv')
    forum_nodes = os.path.join(_SCRIPT_DIR, 'csv', 'forum_nodes.csv')
-    relationships = os.path.join(_SCRIPT_DIR, 'csv', 'relationships.csv')
+    relationships_0 = os.path.join(_SCRIPT_DIR, 'csv', 'relationships_0.csv')
+    relationships_1 = os.path.join(_SCRIPT_DIR, 'csv', 'relationships_1.csv')
    with tempfile.TemporaryDirectory(suffix='-snapshots', dir=_SCRIPT_DIR) as snapshot_dir:
        out_snapshot = os.path.join(snapshot_dir, 'snapshot')
        mg_import_csv = [args.mg_import_csv, '--nodes', comment_nodes,
-                         '--nodes', forum_nodes, '--relationships', relationships,
+                         '--nodes', forum_nodes, '--relationships', relationships_0,
+                         '--relationships', relationships_1,
                         '--out', out_snapshot, '--csv-delimiter=|', '--array-delimiter=;']
        subprocess.check_call(mg_import_csv)
        mg_recovery_check = [args.mg_recovery_check, '--snapshot-dir', snapshot_dir]