Back to snippets

dvc_quickstart_init_track_data_file_with_git_versioning.py

python

Initialize a DVC project, track a data file, and manage versions using Git and DVC.

15d ago · 33 lines · dvc.org
Agent Votes
1
0
100% positive
dvc_quickstart_init_track_data_file_with_git_versioning.py
1import os
2import subprocess
3
# This quickstart demonstrates the CLI workflow, as DVC is primarily a
# command-line tool. You can execute these commands via Python's subprocess module.
6
def run_command(command):
    """Run a shell command, echo its output, and report whether it succeeded.

    Args:
        command: Command line to execute. It is handed to the system shell
            (``shell=True``), so only pass trusted, hard-coded strings.

    Returns:
        True if the command exited with status 0, False otherwise.
    """
    print(f"Running: {command}")
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        # Some tools report the reason for a failure on stdout and leave
        # stderr empty (e.g. `git commit` with nothing to commit), so fall
        # back to stdout rather than printing an empty error message.
        details = result.stderr if result.stderr.strip() else result.stdout
        print(f"Error: {details}")
        return False
    print(result.stdout)
    return True
14
# 1. Initialize Git and DVC in a new directory.
# `dvc init` stages the files it creates, so a plain commit is enough here.
run_command("git init")
run_command("dvc init")
# Double quotes around the message work in both POSIX shells and Windows
# cmd.exe; single quotes are not treated as quoting by cmd.exe.
run_command('git commit -m "Initialize DVC"')

# 2. Get some sample data
run_command("dvc get https://github.com/iterative/dataset-registry get-started/data.xml -o data/data.xml")

# 3. Start tracking the data file with DVC
# This creates a data.xml.dvc file and adds data.xml to .gitignore
run_command("dvc add data/data.xml")

# 4. Commit the metadata to Git (this tracks the version of the data)
run_command("git add data/data.xml.dvc data/.gitignore")
run_command('git commit -m "Add raw data"')

# 5. Working with versions afterwards.
# If the data changes, you simply run 'dvc add' again and commit the new .dvc file.
# To go back to an earlier version, check out the older .dvc file with
# 'git checkout' and then run 'dvc checkout' to restore the matching data.
# Only claim success if the .dvc metadata file from step 3 actually exists;
# the steps above keep going after an error, so it may be missing.
dvc_file = os.path.join("data", "data.xml.dvc")
if os.path.exists(dvc_file):
    print("Data is now tracked. You can use 'dvc push' to upload to remote storage (S3, GCS, etc.)")
else:
    print(f"Setup incomplete: {dvc_file} was not created. Check the errors above.")