Berkeley-Data · oropezaev · Feb 9, 2021 · Feb 11, 2021 · Feb 11, 2021 · Feb 12, 2021
diff --git a/notebooks/4.0-eo-initial-eurosat-data-exploration.ipynb b/notebooks/4.0-eo-initial-eurosat-data-exploration.ipynb
diff --git a/references/ML_pipeline.md b/references/ML_pipeline.md
@@ -5,8 +5,14 @@
 - Train images:269695
 - Test images:125866
 - Validation images:123723
-- Balanced irrigation: 6606
-- Balanced vineyards: 4758
+
+- Balanced train images:13932
+- Balanced test images:6604
+- Balanced val images:6606
+
+- Balanced vineyard train images:9790
+- Balanced vineyard test images:4500
+- Balanced vineyard val images:4758
 
 #### EDA 
 # Integrating and Exploring the Combined MSI Dataset for California

diff --git a/src/data/bemodels b/src/data/bemodels
@@ -0,0 +1 @@
+/workspace/app/data/raw/bigearthnet-models/
diff --git a/src/data/count_tfrecords.py b/src/data/count_tfrecords.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import pandas as pd
+import tensorflow as tf
+
+print(pd.__version__)
+print(tf.__version__)
+
+tf_file_path_list = ['balanced_train.tfrecord', 'balanced_test.tfrecord','balanced_val.tfrecord',
+                     'balanced_vy_train.tfrecord', 'balanced_vy_test.tfrecord', 'balanced_vy_val.tfrecord'
+                     ]
+for csv_file in tf_file_path_list:
+    tf_records_filename = '/workspace/app/data/processed/'+csv_file
+    c = 0
+    for record in tf.python.python_io.tf_record_iterator(tf_records_filename):
+        c += 1
+    print (csv_file,c)
+
diff --git a/src/data/make_data_eurosat.ipynb b/src/data/make_data_eurosat.ipynb
@@ -0,0 +1,323 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!/usr/bin/env python\n",
+    "# coding: utf-8"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import argparse\n",
+    "import csv\n",
+    "import json\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import pandas as pd\n",
+    "import rasterio\n",
+    "import tensorflow as tf\n",
+    "from glob import glob\n",
+    "from tqdm import tqdm\n",
+    "from random import choices"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Path to the EuroSAT extracted files\n",
+    "eurosat_path = '/workspace/app/data/raw/EuroSat/fulldata/'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Models folder is already checked in. No need to download the models\n",
+    "eurosat_folder = '/workspace/app/data/raw/eurosat-models/'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Stores the TFRecords\n",
+    "out_folder = '/workspace/app/data/processed/EuroSat'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not os.path.exists(eurosat_path):\n",
+    "    print('folder', eurosat_path, 'does not exist')\n",
+    "    print('Downloading Data...')\n",
+    "    # Downloads the data from EuroSat website\n",
+    "    # os.system(\"curl http://madm.dfki.de/files/sentinel/EuroSAT.zip -o /data/raw/eurosat_rgb.zip\")\n",
+    "#     os.system(\"unzip /data/raw/eurosat_rgb.zip -d /data/raw\")\n",
+    "#     os.mkdir(eurosat_path)\n",
+    "#     os.rename(\"/data/raw/ds/..../*)   "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create the models folder when it is missing. makedirs also creates any\n",
+    "# missing parent directories, which os.mkdir would fail on.\n",
+    "if not os.path.exists(eurosat_folder):\n",
+    "    print('ERROR: folder', eurosat_folder, 'does not exist')\n",
+    "    os.makedirs(eurosat_folder, exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create the TFRecord output folder when it is missing. makedirs also creates\n",
+    "# any missing parent directories, which os.mkdir would fail on.\n",
+    "if not os.path.exists(out_folder):\n",
+    "    print('ERROR: folder', out_folder, 'does not exist')\n",
+    "    os.makedirs(out_folder, exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using Python Version: 1.1.5\n",
+      "Using TensorFlow Version: 2.3.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# pd.__version__ is the pandas version, so it is labelled as pandas.\n",
+    "print(f'Using pandas Version: {pd.__version__}')\n",
+    "print(f'Using TensorFlow Version: {tf.__version__}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# One label per entry of the EuroSat data folder. os.listdir returns entries\n",
+    "# in arbitrary order, so sort them to give every label the same index on\n",
+    "# every run and machine.\n",
+    "label_list = sorted(os.listdir(eurosat_path))\n",
+    "# Map each label name to its position in label_list.\n",
+    "label_indices = {\n",
+    "    'original_labels': {lbl: idx for idx, lbl in enumerate(label_list)}\n",
+    "}\n",
+    "# Number of labels; the original counter loop left this value in cont.\n",
+    "cont = len(label_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def prep_example_eurosat(bands, original_labels, original_labels_multi_hot, patch_name):\n",
+    "    \"\"\"Pack one patch into a tf.train.Example.\n",
+    "\n",
+    "    bands is indexed by band name; the twelve names listed in band_names\n",
+    "    below are read (there is no 'B10' feature) and each entry is flattened\n",
+    "    with np.ravel into an int64 list. original_labels is an iterable of str\n",
+    "    stored as UTF-8 bytes, original_labels_multi_hot is stored as an int64\n",
+    "    list, and patch_name is a str stored as a single UTF-8 bytes value.\n",
+    "    \"\"\"\n",
+    "    band_names = ('B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08',\n",
+    "                  'B8A', 'B09', 'B11', 'B12')\n",
+    "\n",
+    "    def _int64_feature(values):\n",
+    "        # Wrap an iterable of integers as an int64-list feature.\n",
+    "        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))\n",
+    "\n",
+    "    def _bytes_feature(texts):\n",
+    "        # Wrap an iterable of str as a bytes-list feature (UTF-8 encoded).\n",
+    "        encoded = [text.encode('utf-8') for text in texts]\n",
+    "        return tf.train.Feature(bytes_list=tf.train.BytesList(value=encoded))\n",
+    "\n",
+    "    feature = {name: _int64_feature(np.ravel(bands[name])) for name in band_names}\n",
+    "    feature['original_labels'] = _bytes_feature(original_labels)\n",
+    "    feature['original_labels_multi_hot'] = _int64_feature(original_labels_multi_hot)\n",
+    "    feature['patch_name'] = _bytes_feature([patch_name])\n",
+    "    return tf.train.Example(features=tf.train.Features(feature=feature))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      " Label: Pasture \n",
+      "\n",
+      "Folder Pasture exist already\n",
+      "1996/2000 [============================>.] - ETA: 0s\n",
+      " Label: Industrial \n",
+      "\n",
+      "Folder Industrial exist already\n",
+      "2499/2500 [============================>.] - ETA: 0s\n",
+      " Label: PermanentCrop \n",
+      "\n",
+      "Folder PermanentCrop exist already\n",
+      "2496/2500 [============================>.] - ETA: 0s\n",
+      " Label: AnnualCrop \n",
+      "\n",
+      "Folder AnnualCrop exist already\n",
+      "2998/3000 [============================>.] - ETA: 0s\n",
+      " Label: Highway \n",
+      "\n",
+      "Folder Highway exist already\n",
+      "2497/2500 [============================>.] - ETA: 0s\n",
+      " Label: HerbaceousVegetation \n",
+      "\n",
+      "Folder HerbaceousVegetation exist already\n",
+      "2996/3000 [============================>.] - ETA: 0s\n",
+      " Label: Residential \n",
+      "\n",
+      "Folder Residential exist already\n",
+      "2999/3000 [============================>.] - ETA: 0s\n",
+      " Label: Forest \n",
+      "\n",
+      "Folder Forest exist already\n",
+      "2997/3000 [============================>.] - ETA: 0s\n",
+      " Label: River \n",
+      "\n",
+      "Folder River exist already\n",
+      "2498/2500 [============================>.] - ETA: 0s\n",
+      " Label: SeaLake \n",
+      "\n",
+      "Folder SeaLake exist already\n",
+      "2998/3000 [============================>.] - ETA: 0sCompleted!!!\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Name given to raster band idx+1 of every GeoTIFF (see the read loop below).\n",
+    "# NOTE(review): confirm this matches the band order of the EuroSAT files.\n",
+    "bands_l =['B01','B02','B03','B04','B05','B06','B07','B08',\n",
+    "          'B09','B10','B11','B12','B8A']\n",
+    "\n",
+    "# Split options [Train = 0, Test= 1, Validation= 2]\n",
+    "split_option = [0,1,2]\n",
+    "# Split Probabilities [Train = 50%, Test= 25%, Validation= 25%]\n",
+    "weights = [0.5,0.25, 0.25]\n",
+    "\n",
+    "# TFRecords Writers. The with-block closes (and so flushes) all three files,\n",
+    "# even if reading an image fails part-way through.\n",
+    "with tf.io.TFRecordWriter(os.path.join(out_folder, 'train.tfrecord')) as TFRec_writer_train, \\\n",
+    "     tf.io.TFRecordWriter(os.path.join(out_folder, 'test.tfrecord')) as TFRec_writer_test, \\\n",
+    "     tf.io.TFRecordWriter(os.path.join(out_folder, 'val.tfrecord')) as TFRec_writer_val:\n",
+    "    # Indexed by the value drawn from split_option.\n",
+    "    split_writers = [TFRec_writer_train, TFRec_writer_test, TFRec_writer_val]\n",
+    "\n",
+    "    for tifile in label_list:\n",
+    "        print('\\n Label: {} \\n'.format(tifile))\n",
+    "\n",
+    "        # create labels\n",
+    "        original_labels = [tifile]\n",
+    "\n",
+    "        # hot encode label\n",
+    "        original_labels_multi_hot = np.zeros(len(label_list),dtype=int)\n",
+    "        lidx = label_indices['original_labels'][tifile]\n",
+    "        original_labels_multi_hot[lidx] = 1\n",
+    "\n",
+    "        # loop in the folder\n",
+    "        files_list = os.listdir(os.path.join(eurosat_path,tifile))\n",
+    "\n",
+    "        progress_bar = tf.keras.utils.Progbar(target = len(files_list))\n",
+    "        for findex,fex in enumerate(files_list):\n",
+    "            # Identify each record by its own file name (extension removed)\n",
+    "            # instead of the output folder shared by every record.\n",
+    "            patch_name = os.path.splitext(fex)[0]\n",
+    "\n",
+    "            # Read every band except B10, which prep_example_eurosat does not\n",
+    "            # store; the with-block closes the raster once it has been read.\n",
+    "            bands = {}\n",
+    "            with rasterio.open(os.path.join(eurosat_path,tifile,fex)) as band_ds:\n",
+    "                for band_idx, band_name in enumerate(bands_l, start=1):\n",
+    "                    if band_name == 'B10':\n",
+    "                        continue\n",
+    "                    bands[band_name] = band_ds.read(band_idx)\n",
+    "\n",
+    "            # prep example\n",
+    "            example = prep_example_eurosat(bands,\n",
+    "                                           original_labels,\n",
+    "                                           original_labels_multi_hot,\n",
+    "                                           patch_name)\n",
+    "\n",
+    "            # Randomly route the example to train/test/val according to weights.\n",
+    "            pick = choices(split_option,weights)[0]\n",
+    "            split_writers[pick].write(example.SerializeToString())\n",
+    "\n",
+    "            # findex is 0-based; report the number of files completed so the\n",
+    "            # bar reaches its target.\n",
+    "            progress_bar.update(findex + 1)\n",
+    "\n",
+    "print('Completed!!!')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}