diff --git a/cell-tower-anomaly-detection-dbt/cell-tower-anomaly-detection-dbt/02-config/terraform/main.tf b/cell-tower-anomaly-detection-dbt/cell-tower-anomaly-detection-dbt/02-config/terraform/main.tf index 22b0485..38b399b 100644 --- a/cell-tower-anomaly-detection-dbt/cell-tower-anomaly-detection-dbt/02-config/terraform/main.tf +++ b/cell-tower-anomaly-detection-dbt/cell-tower-anomaly-detection-dbt/02-config/terraform/main.tf @@ -125,7 +125,7 @@ resource "google_storage_bucket" "gcs-bucket" { force_destroy = true provisioner "local-exec" { interpreter = ["/bin/bash" ,"-c"] - command = "gsutil cp '${var.src_customer_data}' '${var.dst_customer_data}'/ && gsutil cp '${var.src_service_data}' '${var.dst_service_data}'/ && gsutil cp '${var.src_telecom_data}' '${var.dst_telecom_data}'/" + command = "gcloud storage cp '${var.src_customer_data}' '${var.dst_customer_data}'/ && gcloud storage cp '${var.src_service_data}' '${var.dst_service_data}'/ && gcloud storage cp '${var.src_telecom_data}' '${var.dst_telecom_data}'/" } } diff --git a/customer_churn_biglake/instructions/04-files-upload.md b/customer_churn_biglake/instructions/04-files-upload.md index fcc98ad..13bb207 100644 --- a/customer_churn_biglake/instructions/04-files-upload.md +++ b/customer_churn_biglake/instructions/04-files-upload.md @@ -48,7 +48,7 @@ Run the following gcloud command in Cloud Shell to create the bucket to store da
``` -gsutil mb -p $PROJECT_ID -c STANDARD -l $REGION -b on gs://$BUCKET_CODE +gcloud storage buckets create gs://$BUCKET_CODE --project=$PROJECT_ID --default-storage-class=STANDARD --location=$REGION --uniform-bucket-level-access ```
@@ -88,7 +88,7 @@ bq show --connection $PROJECT_ID.$REGION.$CONNECTION_ID Grant the necessary permissions to the Service Account:
``` -gsutil iam ch serviceAccount::objectViewer gs://$BUCKET_CODE +gcloud storage buckets add-iam-policy-binding gs://$BUCKET_CODE --member=serviceAccount:<CONNECTION_SERVICE_ACCOUNT_ID> --role=roles/storage.objectViewer ``` ## 6. Create a BigLake table: diff --git a/s8s-prerequisites/01-instructions/01-terraform-instructions.md b/s8s-prerequisites/01-instructions/01-terraform-instructions.md index f141945..9af3818 100644 --- a/s8s-prerequisites/01-instructions/01-terraform-instructions.md +++ b/s8s-prerequisites/01-instructions/01-terraform-instructions.md @@ -115,7 +115,7 @@ PROJECT_ID=`gcloud config list --format "value(core.project)" 2>/dev/null` PROJECT_NBR=`gcloud projects describe $PROJECT_ID | grep projectNumber | cut -d':' -f2 | tr -d "'" | xargs` cd ~/serverless-spark-workshop/s8s-prerequisites/00-scripts-and-config/terraform terraform output > resource-list.txt -gsutil cp resource-list.txt gs://s8s-code-and-data-bucket-$PROJECT_NBR +gcloud storage cp resource-list.txt gs://s8s-code-and-data-bucket-$PROJECT_NBR ``` ## 3. Roles required for the Hackfest Attendees diff --git a/s8s-spark-mlops/02-scripts/bash/build-container-image.sh b/s8s-spark-mlops/02-scripts/bash/build-container-image.sh index c7dd8bd..6616d58 100755 --- a/s8s-spark-mlops/02-scripts/bash/build-container-image.sh +++ b/s8s-spark-mlops/02-scripts/bash/build-container-image.sh @@ -163,7 +163,7 @@ echo "Completed Dockerfile creation" # Download dependencies to be baked into image cd $LOCAL_SCRATCH_DIR -gsutil cp $BQ_CONNECTOR_JAR_URI . +gcloud storage cp $BQ_CONNECTOR_JAR_URI . wget -P . 
https://repo.anaconda.com/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh echo "Completed downloading dependencies" diff --git a/serverless_spark_streaming/instructions/05a_serverless_spark_streaming_gcloud_execution.md b/serverless_spark_streaming/instructions/05a_serverless_spark_streaming_gcloud_execution.md index b5da30f..b6e65ff 100644 --- a/serverless_spark_streaming/instructions/05a_serverless_spark_streaming_gcloud_execution.md +++ b/serverless_spark_streaming/instructions/05a_serverless_spark_streaming_gcloud_execution.md @@ -126,7 +126,7 @@ BUCKET_CODE= #GCP bucket where our code, #### 4.2.2 Command to copy Files. ``` -gsutil cp gs://$BUCKET_CODE/serverless_spark_streaming/01-datasets/data_files/<> gs://$BUCKET_CODE/serverless_spark_streaming/01-datasets/streaming_data/ +gcloud storage cp gs://$BUCKET_CODE/serverless_spark_streaming/01-datasets/data_files/<> gs://$BUCKET_CODE/serverless_spark_streaming/01-datasets/streaming_data/ ``` diff --git a/serverless_spark_streaming/instructions/05b_serverless_spark_streaming_console_execution.md b/serverless_spark_streaming/instructions/05b_serverless_spark_streaming_console_execution.md index 3294d9d..b0d71c9 100644 --- a/serverless_spark_streaming/instructions/05b_serverless_spark_streaming_console_execution.md +++ b/serverless_spark_streaming/instructions/05b_serverless_spark_streaming_console_execution.md @@ -135,7 +135,7 @@ BUCKET_CODE= #GCP bucket where our code, #### 4.2.2 Command to copy Files. 
``` -gsutil cp gs://$BUCKET_CODE/serverless_spark_streaming/01-datasets/data_files/<> gs://$BUCKET_CODE/serverless_spark_streaming/01-datasets/streaming_data/ +gcloud storage cp gs://$BUCKET_CODE/serverless_spark_streaming/01-datasets/data_files/<> gs://$BUCKET_CODE/serverless_spark_streaming/01-datasets/streaming_data/ ``` diff --git a/social_media_data_analytics/instructions/04-create-docker-image.md b/social_media_data_analytics/instructions/04-create-docker-image.md index dbe4c06..6676927 100644 --- a/social_media_data_analytics/instructions/04-create-docker-image.md +++ b/social_media_data_analytics/instructions/04-create-docker-image.md @@ -26,7 +26,7 @@ gcloud auth configure-docker ${REGION}-docker.pkg.dev Run the below command in VM. ``` -gsutil cp \ +gcloud storage cp \ gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.22.2.jar . ``` diff --git a/social_network_graph/instructions/05-create-docker-image.md b/social_network_graph/instructions/05-create-docker-image.md index 64fd35d..77fddef 100644 --- a/social_network_graph/instructions/05-create-docker-image.md +++ b/social_network_graph/instructions/05-create-docker-image.md @@ -48,7 +48,7 @@ We will use it throughout the lab.
Run the below command in VM. ``` -gsutil cp \ +gcloud storage cp \ gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.22.2.jar . ``` diff --git a/timeseries_forecasting/instructions/05-create-docker-image.md b/timeseries_forecasting/instructions/05-create-docker-image.md index cdc67dd..2491753 100644 --- a/timeseries_forecasting/instructions/05-create-docker-image.md +++ b/timeseries_forecasting/instructions/05-create-docker-image.md @@ -26,7 +26,7 @@ gcloud auth configure-docker ${REGION}-docker.pkg.dev Run the below command in VM. ``` -gsutil cp \ +gcloud storage cp \ gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.22.2.jar . ``` diff --git a/wikipedia-page-view-analysis/README.md b/wikipedia-page-view-analysis/README.md index eb2d518..505c0b7 100644 --- a/wikipedia-page-view-analysis/README.md +++ b/wikipedia-page-view-analysis/README.md @@ -1,149 +1,149 @@ -# Wikipedia Page Views Analysis from the BigQuery UI powered by Dataproc Serverless Spark - -Lab contributed by [TEKsystems](https://www.teksystems.com/en/about-us/partnerships/google-cloud) and Google. - -This lab demonstrates how to use the BigQuery UI for running Dataproc Serverless Spark jobs for data analytics. - -## 1. Prerequisite -See [this lab for an example prerequisite set up](https://github.com/GoogleCloudPlatform/serverless-spark-workshop/blob/main/malware_detection/instructions/01-gcp-prerequisites.md) or [these Terraform modules to provision and configure a Serverless Spark environment](https://github.com/anagha-google/ts22-just-enough-terraform-for-da). - -## 2. Variables - -Paste this into Google Cloud CLI in Cloud Shell after replacing with your values - -``` -PROJECT_ID=YOUR_PROJECT_ID -PROJECT_NBR=YOUR_PROJECT_NBR -BQ_UI_BUCKET_NM=gs://s8s-bigspark-$PROJECT_NBR -LOCATION=us-central1 -``` - -## 3. Storage Bucket - -A storage bucket is needed, for Serverless Spark. 
Lets create one- -``` -gsutil mb -p $PROJECT_ID -c STANDARD -l $LOCATION -b on $BQ_UI_BUCKET_NM -``` - -## 4. Needed in the UI - -Just the storage bucket created above. -Note: You will run the lab as your own identity. - -## 5. Wikipedia Page Views Analysis - code - -``` -# Copyright 2022 Google LLC. -# SPDX-License-Identifier: Apache-2.0 - -from pyspark.sql import SparkSession -from pyspark.ml.feature import StopWordsRemover -from pyspark.sql import functions as F - -spark = SparkSession.builder \ -.appName('Wikipedia-Analytics')\ -.getOrCreate() - -# Base dataset in BQ -bqTableFQN = "bigquery-public-data.wikipedia.pageviews_2019" - -# Read base dataset with filters -wikiPageviewsDF = spark.read \ -.format("bigquery") \ -.option("table", bqTableFQN) \ -.option("filter", "datehour >= '2019-01-01' ") \ -.load() - -# Subset the columns -pageViewsSubsetDF = wikiPageviewsDF \ -.select("title", "wiki", "views") \ -.where("views > 5") - -# Cache -pageViewsSubsetDF.cache() - -# Filter to just english -pageViewsSubsetEnglishDF = pageViewsSubsetDF \ -.where("wiki in ('en', 'en.m')") - -# Aggregate by title -pageViewsSubsetEnglishByTitleDF = pageViewsSubsetEnglishDF \ -.groupBy("title") \ -.agg(F.sum('views').alias('total_views')) - -# Order by and print -pageViewsSubsetEnglishByTitleDF.orderBy('total_views', ascending=False).show(20) -``` - -## 6. Lets get started - -### 6.1. Navigate to the BQ UI from Cloud Console - -![bq-1](images/00-bq-01.png) -
- -### 6.2. Click on create - -![bq-2](images/00-bq-02.png) -
- -### 6.3. Click on "Compose new PySpark" - -![bq-3](images/00-bq-03.png) -
- -### 6.4. Click on "More -> PySpark Options" - -![bq-4](images/00-bq-04.png) -
- -### 6.5. Click on "Browse and select staging folder" - -![bq-5](images/00-bq-05.png) -
- -### 6.6. Click on "s8s bigSpark bucket" - -![bq-6](images/00-bq-06.png) -
- -![bq-7](images/00-bq-07.png) -
- -### 6.7. Select your network and subnet - -![bq-8](images/00-bq-08.png) -
- -### 6.8. Click "Save" -![bq-9](images/00-bq-09.png) -
- -### 6.9. Paste the code snippet above, into the UI text area and click "Run" -![bq-10](images/00-bq-10.png) -
- -### 6.10. Switch to Dataproc UI "Batches" view - -You should see a new batch job- - -![bq-11](images/00-bq-11.png) -
- -### 6.11. Switch back to the BQ UI to view the results - -![bq-12](images/00-bq-12.png) -
-
-![bq-13](images/00-bq-13.png) - -### 6.12. Switch to Dataproc UI "Batches" view to look at the monitoring tile -You should see the autoscale kick in after 2 minutes -
-![bq-13](images/00-bq-14.png) -
- - -##### ===================================================================================================== -##### THIS CONCLUDES THIS LAB -##### PROGRESS TO NEXT LAB, OR SHUT DOWN RESOURCES -##### ===================================================================================================== +# Wikipedia Page Views Analysis from the BigQuery UI powered by Dataproc Serverless Spark + +Lab contributed by [TEKsystems](https://www.teksystems.com/en/about-us/partnerships/google-cloud) and Google. + +This lab demonstrates how to use the BigQuery UI for running Dataproc Serverless Spark jobs for data analytics. + +## 1. Prerequisite +See [this lab for an example prerequisite set up](https://github.com/GoogleCloudPlatform/serverless-spark-workshop/blob/main/malware_detection/instructions/01-gcp-prerequisites.md) or [these Terraform modules to provision and configure a Serverless Spark environment](https://github.com/anagha-google/ts22-just-enough-terraform-for-da). + +## 2. Variables + +Paste this into Google Cloud CLI in Cloud Shell after replacing with your values - +``` +PROJECT_ID=YOUR_PROJECT_ID +PROJECT_NBR=YOUR_PROJECT_NBR +BQ_UI_BUCKET_NM=gs://s8s-bigspark-$PROJECT_NBR +LOCATION=us-central1 +``` + +## 3. Storage Bucket + +A storage bucket is needed, for Serverless Spark. Lets create one- +``` +gcloud storage buckets create --project $PROJECT_ID --default-storage-class STANDARD --location $LOCATION --uniform-bucket-level-access $BQ_UI_BUCKET_NM +``` + +## 4. Needed in the UI + +Just the storage bucket created above. +Note: You will run the lab as your own identity. + +## 5. Wikipedia Page Views Analysis - code + +``` +# Copyright 2022 Google LLC. 
+# SPDX-License-Identifier: Apache-2.0 + +from pyspark.sql import SparkSession +from pyspark.ml.feature import StopWordsRemover +from pyspark.sql import functions as F + +spark = SparkSession.builder \ +.appName('Wikipedia-Analytics')\ +.getOrCreate() + +# Base dataset in BQ +bqTableFQN = "bigquery-public-data.wikipedia.pageviews_2019" + +# Read base dataset with filters +wikiPageviewsDF = spark.read \ +.format("bigquery") \ +.option("table", bqTableFQN) \ +.option("filter", "datehour >= '2019-01-01' ") \ +.load() + +# Subset the columns +pageViewsSubsetDF = wikiPageviewsDF \ +.select("title", "wiki", "views") \ +.where("views > 5") + +# Cache +pageViewsSubsetDF.cache() + +# Filter to just english +pageViewsSubsetEnglishDF = pageViewsSubsetDF \ +.where("wiki in ('en', 'en.m')") + +# Aggregate by title +pageViewsSubsetEnglishByTitleDF = pageViewsSubsetEnglishDF \ +.groupBy("title") \ +.agg(F.sum('views').alias('total_views')) + +# Order by and print +pageViewsSubsetEnglishByTitleDF.orderBy('total_views', ascending=False).show(20) +``` + +## 6. Lets get started + +### 6.1. Navigate to the BQ UI from Cloud Console + +![bq-1](images/00-bq-01.png) +
+ +### 6.2. Click on create + +![bq-2](images/00-bq-02.png) +
+ +### 6.3. Click on "Compose new PySpark" + +![bq-3](images/00-bq-03.png) +
+ +### 6.4. Click on "More -> PySpark Options" + +![bq-4](images/00-bq-04.png) +
+ +### 6.5. Click on "Browse and select staging folder" + +![bq-5](images/00-bq-05.png) +
+ +### 6.6. Click on "s8s bigSpark bucket" + +![bq-6](images/00-bq-06.png) +
+ +![bq-7](images/00-bq-07.png) +
+ +### 6.7. Select your network and subnet + +![bq-8](images/00-bq-08.png) +
+ +### 6.8. Click "Save" +![bq-9](images/00-bq-09.png) +
+ +### 6.9. Paste the code snippet above, into the UI text area and click "Run" +![bq-10](images/00-bq-10.png) +
+ +### 6.10. Switch to Dataproc UI "Batches" view + +You should see a new batch job- + +![bq-11](images/00-bq-11.png) +
+ +### 6.11. Switch back to the BQ UI to view the results + +![bq-12](images/00-bq-12.png) +
+
+![bq-13](images/00-bq-13.png) + +### 6.12. Switch to Dataproc UI "Batches" view to look at the monitoring tile +You should see the autoscale kick in after 2 minutes +
+![bq-14](images/00-bq-14.png) +
+ + +##### ===================================================================================================== +##### THIS CONCLUDES THIS LAB +##### PROGRESS TO NEXT LAB, OR SHUT DOWN RESOURCES +##### =====================================================================================================