https://github.com/mneedham/spark-summit-2019-demo
https://github.com/mneedham/spark-summit-2019-demo
Last synced: 12 months ago
JSON representation
- Host: GitHub
- URL: https://github.com/mneedham/spark-summit-2019-demo
- Owner: mneedham
- Created: 2019-04-10T12:16:38.000Z (about 7 years ago)
- Default Branch: master
- Last Pushed: 2019-04-10T14:06:09.000Z (about 7 years ago)
- Last Synced: 2025-02-14T13:50:19.771Z (over 1 year ago)
- Language: Jupyter Notebook
- Size: 45.9 KB
- Stars: 3
- Watchers: 3
- Forks: 0
- Open Issues: 0
-
Metadata Files:
- Readme: README.adoc
Awesome Lists containing this project
README
= Spark Summit 2019 Demo
== Neo4j
=== Data Import
[source,cypher]
----
// Requires the APOC procedures: apoc.merge.relationship is called in the last statement.

// Each Character node is identified by a unique id
CREATE CONSTRAINT ON (character:Character) ASSERT character.id IS UNIQUE;

// Characters: read the nodes file of every season (1 through 7) and
// create each character once, setting its name only on first creation
UNWIND range(1,7) AS seasonNumber
LOAD CSV WITH HEADERS FROM "https://github.com/mneedham/spark-summit-2019-demo/raw/master/data/got-s" + seasonNumber + "-nodes.csv" AS line
MERGE (character:Character {id: line.Id})
ON CREATE SET character.name = line.Label;

// Interactions: read the edges file of every season and link the two characters.
// The relationship type is built from the season number (INTERACTS_SEASON1 .. INTERACTS_SEASON7),
// and the Weight column is stored as an integer property on the relationship
UNWIND range(1,7) AS seasonNumber
LOAD CSV WITH HEADERS FROM "https://github.com/mneedham/spark-summit-2019-demo/raw/master/data/got-s" + seasonNumber + "-edges.csv" AS line
MATCH (sourceCharacter:Character {id: line.Source})
MATCH (targetCharacter:Character {id: line.Target})
CALL apoc.merge.relationship(sourceCharacter, "INTERACTS_SEASON" + seasonNumber, {}, {}, targetCharacter) YIELD rel AS interaction
SET interaction.weight = toInteger(line.Weight)
----
== Apache Spark
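* Download https://www.apache.org/dyn/closer.lua/spark/spark-2.4.1/spark-2.4.1-bin-hadoop2.7.tgz[Apache Spark 2.4.1^] and extract the archive into this directory (the launch command below runs `./spark-2.4.1-bin-hadoop2.7/bin/pyspark`)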
[source,bash]
----
# Install the Python dependencies listed in requirements.txt
pip install -r requirements.txt

# Export the PySpark driver settings used by the launch step
export PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS='notebook'
----
[source,bash]
----
# Start pyspark from the directory named by SPARK_VERSION with the GraphFrames package loaded
export SPARK_VERSION="spark-2.4.1-bin-hadoop2.7"
"./${SPARK_VERSION}/bin/pyspark" \
  --packages graphframes:graphframes:0.7.0-spark2.4-s_2.11 \
  --driver-memory 2g \
  --executor-memory 6g
----