If you are planning a relatively short training period (less than 24 hours), you may want to create a cheaper preemptible instance:
# Create a preemptible GPU VM that runs startup_gcp.sh on first boot.
# gcp_instance_name, boot_disk_name and default-subnet are placeholders.
# NOTE(review): Compute Engine instance names normally accept only lowercase
# letters, digits and hyphens — confirm before using a name with underscores.
gcloud compute instances create gcp_instance_name \
  --image=ubuntu-1804-bionic-v20200916 \
  --image-project=ubuntu-os-cloud \
  --boot-disk-device-name=boot_disk_name \
  --boot-disk-size=150 \
  --boot-disk-type=pd-standard \
  --machine-type=n1-highmem-4 \
  --accelerator=count=1,type=nvidia-tesla-t4 \
  --maintenance-policy=TERMINATE \
  --network-interface=subnet=default-subnet \
  --metadata-from-file=startup-script=startup_gcp.sh \
  --preemptible
The startup script will install Docker and a few more useful libraries on your newly created machine:
#!/bin/bash
# Startup script: prepares a fresh Ubuntu VM (locale, timezone, NVIDIA driver,
# Docker, registry credentials, NVIDIA container runtime).
echo "startup-script"
echo "Set locals and timezone"
sudo locale-gen "en_US.UTF-8"
# The script runs unattended, so force the noninteractive debconf frontend;
# the default frontend would try to open a configuration dialog.
sudo dpkg-reconfigure --frontend=noninteractive locales
sudo timedatectl set-timezone Asia/Singapore
# https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html
echo "NVIDIA Driver Installation"
# Refresh the package index so the headers for the running kernel can be found.
sudo apt-get update
# -y: the script runs unattended; without it apt-get aborts at the confirmation prompt.
sudo apt-get install -y "linux-headers-$(uname -r)"
# Distro ID + version with the dot removed (e.g. "ubuntu1804"), as used in NVIDIA's repo paths.
distribution=$(. /etc/os-release; echo "${ID}${VERSION_ID}" | sed -e 's/\.//g')
cuda_repo="https://developer.download.nvidia.com/compute/cuda/repos/${distribution}/x86_64"
wget "${cuda_repo}/cuda-${distribution}.pin"
sudo mv "cuda-${distribution}.pin" /etc/apt/preferences.d/cuda-repository-pin-600
# NVIDIA rotated the repository signing key in April 2022: 3bf863cc replaces the
# retired 7fa2af80, which no longer validates the repository metadata.
sudo apt-key adv --fetch-keys "${cuda_repo}/3bf863cc.pub"
echo "deb http://developer.download.nvidia.com/compute/cuda/repos/${distribution}/x86_64 /" | sudo tee /etc/apt/sources.list.d/cuda.list
sudo apt-get update
sudo apt-get -y install cuda-drivers
# Print the detected GPUs as a quick check that the driver is usable.
nvidia-smi
echo ":trying to remove docker engine (if exists)"
# -y keeps the unattended run from aborting at apt's confirmation prompt when
# one of the older Docker packages is actually installed.
sudo apt-get remove -y docker docker-engine docker.io containerd runc
echo ":apt-get update"
sudo apt-get update
# Tools used below to import the repository key and register the apt source.
sudo apt-get install -y \
  apt-transport-https \
  ca-certificates \
  curl \
  gnupg-agent \
  software-properties-common
echo ":curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -"
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
echo ":apt-key fingerprint 0EBFCD88"
sudo apt-key fingerprint 0EBFCD88
echo ":sudo add-apt-repository..."
# Register Docker's "stable" channel for the release codename reported by lsb_release.
sudo add-apt-repository \
  "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
sudo apt-get update
echo ":sudo apt-get install -y docker-ce docker-ce-cli containerd.io"
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
# https://docs.docker.com/engine/install/linux-postinstall/
echo ":using docker without sudo"
# -f: succeed quietly when the "docker" group already exists.
sudo groupadd -f docker
# Fall back to the effective user when USER is not set in the script's environment.
# NOTE(review): if this runs as root at boot, add your login user to the group instead — confirm.
sudo usermod -aG docker "${USER:-$(id -un)}"
# `newgrp docker` is intentionally not run here: it starts a new shell, which
# would interrupt the rest of this script. The new group membership takes
# effect at the next login (or run `newgrp docker` by hand in your session).
echo Configure authentication to Container Registry.
# https://cloud.google.com/container-registry/docs/advanced-authentication
VERSION=2.0.2
OS=linux
ARCH=amd64
# One definition of the archive name, so the download target and the file
# extracted below stay in sync when VERSION, OS or ARCH change.
archive="docker-credential-gcr_${OS}_${ARCH}-${VERSION}.tar.gz"
echo ":curl -L ..."
# -f: fail on an HTTP error instead of saving the error page as the archive.
curl -fL "https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v${VERSION}/${archive}" -o "${archive}"
echo ":tar xvf ..."
tar xvf "./${archive}"
echo ":sudo cp ./docker-credential-gcr /usr/local/bin/docker-credential-gcr"
sudo cp ./docker-credential-gcr /usr/local/bin/docker-credential-gcr
echo ":chmod +x /usr/local/bin/docker-credential-gcr"
sudo chmod +x /usr/local/bin/docker-credential-gcr
echo ":docker-credential-gcr configure-docker"
docker-credential-gcr configure-docker
echo "Install nvidia-docker"
# Distro ID + version (e.g. "ubuntu18.04"); selects the matching repository list below.
distribution=$(. /etc/os-release; echo "${ID}${VERSION_ID}")
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.list" | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt-get update
sudo apt-get install -y nvidia-docker2
# Restart the Docker daemon after nvidia-docker2 has been installed.
sudo systemctl restart docker
# apt-get with -y: `apt` is meant for interactive use and would wait for
# confirmation, which an unattended script cannot give.
sudo apt-get install -y zip unzip
The Dockerfile may look like this:
# We start with specifying our base image. Use the FROM keyword to do that -
# FROM tensorflow/tensorflow:2.3.0-gpu
# FROM tensorflow/tensorflow:latest-gpu
FROM tensorflow/tensorflow:nightly-gpu

# System packages, installed in a single layer. `apt-get update` must run in
# the same RUN as `install`: the package index is not guaranteed to be present
# (or current) in the base image. DEBIAN_FRONTEND=noninteractive stops tzdata
# from prompting during the build.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y locales nano tzdata \
    && rm -rf /var/lib/apt/lists/*

# Enable and generate the en_US.UTF-8 locale, then make it the default.
RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && locale-gen
ENV LANG=en_US.UTF-8
ENV LANGUAGE=en_US:en
ENV LC_ALL=en_US.UTF-8

# Set the container timezone to Asia/Singapore.
RUN ln -fs /usr/share/zoneinfo/Asia/Singapore /etc/localtime \
    && dpkg-reconfigure -f noninteractive tzdata

# First, we set a working directory and then copy all the files for our app.
WORKDIR /usr/src/app
# copy all the files to the container
# adds files from your Docker client's current directory.
COPY . .

RUN python3 -m pip install --upgrade pip
# install dependencies
# RUN pip install -r requirements.txt
# scikit-learn is the real distribution name; the `sklearn` name on PyPI is a
# deprecated placeholder and is not installable.
RUN pip install numpy pandas scikit-learn matplotlib pandas_gbq
For cases where you need to share directories between the host and the Docker container, use:
# Template only: replace "...." with the volume mapping ([host-src:]container-dest) followed by the image to run.
docker run --name=docker_instance_name --gpus all -d -v ....
According to the `docker run` reference, these options mean:
-v --volume=[host-src:]container-dest[:<options>]: Bind mount a volume.
-d to start a container in detached mode
--gpus GPU devices to add to the container (‘all’ to pass all GPUs)