An open API service indexing awesome lists of open source software.

https://github.com/mneedham/spark-summit-2019-demo


https://github.com/mneedham/spark-summit-2019-demo

Last synced: 12 months ago
JSON representation

Awesome Lists containing this project

README

          

= Spark Summit 2019 Demo

== Neo4j

=== Data Import

[source,cypher]
----
// Enforce one Character node per id, so the MERGE below cannot create duplicates.
CREATE CONSTRAINT ON (c:Character) ASSERT c.id IS UNIQUE;

// Create nodes
// Reads one nodes file per season (1 to 7). MERGE reuses the existing node when a
// character shows up in more than one season; the name is only set when the
// node is first created.
UNWIND range(1,7) AS season
LOAD CSV WITH HEADERS FROM "https://github.com/mneedham/spark-summit-2019-demo/raw/master/data/got-s" + season + "-nodes.csv" AS row
MERGE (c:Character {id: row.Id})
ON CREATE SET c.name = row.Label;

// Create relationships
// Reads one edges file per season. The relationship type is built per season
// ("INTERACTS_SEASON" + season), so it is merged through apoc.merge.relationship
// instead of a literal MERGE pattern. Rows whose Source or Target has no
// matching Character node are dropped by the MATCH clauses.
UNWIND range(1,7) AS season
LOAD CSV WITH HEADERS FROM "https://github.com/mneedham/spark-summit-2019-demo/raw/master/data/got-s" + season + "-edges.csv" AS row
MATCH (source:Character {id: row.Source})
MATCH (target:Character {id: row.Target})
CALL apoc.merge.relationship(source, "INTERACTS_SEASON" + season, {}, {}, target) YIELD rel
// LOAD CSV yields every field as a string, so the weight is converted explicitly.
SET rel.weight = toInteger(row.Weight);
----

== Apache Spark

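* Download https://www.apache.org/dyn/closer.lua/spark/spark-2.4.1/spark-2.4.1-bin-hadoop2.7.tgz[Apache Spark 2.4.1^] and extract the archive into this directory; the launch command below expects a `spark-2.4.1-bin-hadoop2.7` folder here.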

[source,bash]
----
# Install the Python dependencies listed in requirements.txt.
pip install -r requirements.txt
# Have the pyspark launcher use Jupyter as the driver Python, started in notebook mode.
export PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS='notebook'
----

[source,bash]
----
# Name of the extracted Spark distribution folder, looked up relative to the current directory.
export SPARK_VERSION="spark-2.4.1-bin-hadoop2.7"
# Start PySpark with the GraphFrames package pulled in via --packages.
# The path is quoted so the expansion is never word-split or globbed.
"./${SPARK_VERSION}/bin/pyspark" \
--driver-memory 2g \
--executor-memory 6g \
--packages graphframes:graphframes:0.7.0-spark2.4-s_2.11
----