From 6294bf19ecc2042c06d0db7fb0f0038540636f9e Mon Sep 17 00:00:00 2001 From: Teon Banek Date: Thu, 21 Dec 2017 14:06:36 +0100 Subject: [PATCH] Add documentation for mg_import_csv Reviewers: mtomic, buda, florijan, mferencevic Reviewed By: florijan Subscribers: pullbot Differential Revision: https://phabricator.memgraph.io/D1073 --- docs/user_technical/README.md | 1 + docs/user_technical/examples.md | 2 +- docs/user_technical/import-tools.md | 118 ++++++++++++++++++++++++++++ tools/src/mg_import_csv/main.cpp | 25 ++++-- tools/tests/test_mg_import_csv | 3 +- 5 files changed, 142 insertions(+), 7 deletions(-) create mode 100644 docs/user_technical/import-tools.md diff --git a/docs/user_technical/README.md b/docs/user_technical/README.md index 428713e6a..4203e2db1 100644 --- a/docs/user_technical/README.md +++ b/docs/user_technical/README.md @@ -16,6 +16,7 @@ data structures, multi-version concurrency control and asynchronous IO. * [Drivers](drivers.md) * [Storable Data Types](data-types.md) * [openCypher Query Language](open-cypher.md) + * [Import Tools](import-tools.md) * [Upcoming Features](upcoming-features.md) [//]: # (Nothing should go below the contents section) diff --git a/docs/user_technical/examples.md b/docs/user_technical/examples.md index a35bdca83..7629ef152 100644 --- a/docs/user_technical/examples.md +++ b/docs/user_technical/examples.md @@ -35,7 +35,7 @@ We have prepared a database snapshot for this example, so you can easily import when starting memgraph using `durability-directory` option: ``` -./memgraph --durability-directory /usr/share/memgraph/examples/TEDTalk --durability-enabled=false +memgraph --durability-directory /usr/share/memgraph/examples/TEDTalk --durability-enabled=false ``` NOTE: If you modify dataset these changes will stay diff --git a/docs/user_technical/import-tools.md b/docs/user_technical/import-tools.md new file mode 100644 index 000000000..7815b0003 --- /dev/null +++ b/docs/user_technical/import-tools.md @@ -0,0 +1,118 @@ +## 
Import Tools + +Memgraph comes with tools for importing data into the database. Currently, +only import of CSV formatted data is supported. We plan to support more formats in +the future. + +### CSV Import Tool + +CSV data should be in Neo4j CSV compatible format. Detailed format +specification can be found +[here](https://neo4j.com/docs/operations-manual/current/tools/import/file-header-format/). + +The import tool is run from the console, using the `mg_import_csv` command. + +If you installed Memgraph using Docker, you will need to run the importer +using the following command: + +``` +docker run -v mg_lib:/var/lib/memgraph -v mg_etc:/etc/memgraph -v mg_import:/import-data \ + --entrypoint=mg_import_csv memgraph +``` + +You can pass CSV files containing node data using the `--nodes` option. +Multiple files can be specified by repeating the `--nodes` option. At least +one node file should be specified. Similarly, graph edges (also known as +relationships) are passed via the `--relationships` option. Multiple +relationship files are imported by repeating the option. Unlike nodes, +relationships are not required. + +After reading the CSV files, the tool will by default search for the installed +Memgraph configuration. If the configuration is found, the data will be +written to the configured durability directory. If the configuration isn't +found, you will need to use the `--out` option to specify the output file. You +can use the same option to override the default behaviour. + +Memgraph will recover the imported data on the next startup by looking in the +durability directory. + +For information on other options, run: + +``` +mg_import_csv --help +``` + +When using Docker, this translates to: + +``` +docker run --entrypoint=mg_import_csv memgraph --help +``` + +#### Example + +Let's import a simple dataset. + +Store the following in `comment_nodes.csv`. 
+ +``` +id:ID(COMMENT_ID),country:string,browser:string,content:string,:LABEL +0,Croatia,Chrome,yes,Message;Comment +1,United Kingdom,Chrome,thanks,Message;Comment +2,Germany,,LOL,Message;Comment +3,France,Firefox,I see,Message;Comment +4,Italy,Internet Explorer,fine,Message;Comment +``` + +Now, let's add `forum_nodes.csv`. + +``` +id:ID(FORUM_ID),title:string,:LABEL +0,General,Forum +1,Support,Forum +2,Music,Forum +3,Film,Forum +4,Programming,Forum +``` + +And finally, set relationships between comments and forums in +`relationships.csv`. + +``` +:START_ID(COMMENT_ID),:END_ID(FORUM_ID),:TYPE +0,0,POSTED_ON +1,1,POSTED_ON +2,2,POSTED_ON +3,3,POSTED_ON +4,4,POSTED_ON +``` + +Now, you can import the dataset into Memgraph. + +WARNING: Your existing recovery data will be considered obsolete, and Memgraph +will load the new dataset. + +Use the following command: + +``` +mg_import_csv --nodes=comment_nodes.csv --nodes=forum_nodes.csv --relationships=relationships.csv +``` + +If using Docker, things are a bit more complicated. First, you need to copy the +CSV files to where the Docker container can see them: + +``` +mkdir -p /var/lib/docker/volumes/mg_import/_data +cp comment_nodes.csv forum_nodes.csv relationships.csv /var/lib/docker/volumes/mg_import/_data +``` + +Then, run the importer with the following: + +``` +docker run -v mg_lib:/var/lib/memgraph -v mg_etc:/etc/memgraph -v mg_import:/import-data \ + --entrypoint=mg_import_csv memgraph \ + --nodes=/import-data/comment_nodes.csv --nodes=/import-data/forum_nodes.csv \ + --relationships=/import-data/relationships.csv +``` + +Next time you run Memgraph, the dataset will be loaded. 
+ diff --git a/tools/src/mg_import_csv/main.cpp b/tools/src/mg_import_csv/main.cpp index 181ec9f0f..6e02eed1d 100644 --- a/tools/src/mg_import_csv/main.cpp +++ b/tools/src/mg_import_csv/main.cpp @@ -46,8 +46,22 @@ auto ParseRepeatedFlag(const std::string &flagname, int argc, char *argv[]) { std::vector values; for (int i = 1; i < argc; ++i) { std::string flag(argv[i]); - if ((flag == "--" + flagname || flag == "-" + flagname) && i + 1 < argc) - values.push_back(argv[++i]); + int matched_flag_dashes = 0; + if (utils::StartsWith(flag, "--" + flagname)) + matched_flag_dashes = 2; + else if (utils::StartsWith(flag, "-" + flagname)) + matched_flag_dashes = 1; + // Get the value if we matched the flag. + if (matched_flag_dashes != 0) { + std::string value; + auto maybe_value = flag.substr(flagname.size() + matched_flag_dashes); + if (maybe_value.empty() && i + 1 < argc) + value = argv[++i]; + else if (!maybe_value.empty() && maybe_value.front() == '=') + value = maybe_value.substr(1); + CHECK(!value.empty()) << "The argument '" << flagname << "' is required"; + values.push_back(value); + } } return values; } @@ -385,11 +399,12 @@ std::string GetOutputPath() { // other flags which are defined in this file. LoadConfig(); // Without durability_directory, we have to require 'out' flag. - if (utils::Trim(FLAGS_durability_directory).empty()) + auto durability_dir = utils::Trim(FLAGS_durability_directory); + if (durability_dir.empty()) LOG(FATAL) << "Unable to determine snapshot output location. 
Please, " "provide the 'out' flag"; - std::string snapshot_dir = FLAGS_durability_directory + "/snapshots"; try { + auto snapshot_dir = durability_dir + "/snapshots"; if (!std::experimental::filesystem::exists(snapshot_dir) && !std::experimental::filesystem::create_directories(snapshot_dir)) { LOG(FATAL) << fmt::format("Cannot create snapshot directory '{}'", @@ -398,7 +413,7 @@ std::string GetOutputPath() { } catch (const std::experimental::filesystem::filesystem_error &error) { LOG(FATAL) << error.what(); } - return std::string(durability::MakeSnapshotPath(snapshot_dir)); + return std::string(durability::MakeSnapshotPath(durability_dir)); } int main(int argc, char *argv[]) { diff --git a/tools/tests/test_mg_import_csv b/tools/tests/test_mg_import_csv index c96f49286..d8f58f50b 100755 --- a/tools/tests/test_mg_import_csv +++ b/tools/tests/test_mg_import_csv @@ -31,7 +31,8 @@ def main(): os.makedirs(snapshot_dir, exist_ok=True) out_snapshot = os.path.join(snapshot_dir, 'snapshot') mg_import_csv = [args.mg_import_csv, '--nodes', comment_nodes, - '--nodes', forum_nodes, '--relationships', relationships_0, + '--nodes={}'.format(forum_nodes), + '--relationships={}'.format(relationships_0), '--relationships', relationships_1, '--out', out_snapshot, '--csv-delimiter=|', '--array-delimiter=;'] subprocess.check_call(mg_import_csv)