We are using Databricks without Unity Catalog enabled, so we don’t have access to the INFORMATION_SCHEMA objects.
The commands SHOW VIEWS IN [Schema] and SHOW TABLES IN [Schema] works however. We built some macros around this to do a weekly cleanup of all orphan tables/views that are no longer referenced by dbt.
The macros called from our cleanup job are:
- drop_unused_tables_and_views
- drop_empty_schemas
The other macros are utility functions…
Gets a list of database tables and views from Databricks:
{% macro get_all_tables_and_views() %}
{#
This macro returns a list of tables and views from the databases as well as the schema name
Note:
For some reason Databricks will sometimes list the same schema/name in both SHOW TABLES and SHOW VIEWS
It seems to work better when views are loaded first and subsequently later deleted first in drop_unused_tables_and_views macro
#}
{{ log( 'get_all_tables_and_views: Loading views and tables from database' , info=true) }}
{% set schema_results = run_query('SHOW DATABASES') %}
{% set all_schemas = schema_results.columns[0].values() %}
{% set all_tables_and_views = [] %}
{% for schema in all_schemas %}
{% set view_results = run_query('SHOW VIEWS IN ' ~ schema) %}
{% set views_in_schema = view_results.columns[1].values() %}
{% for name in views_in_schema %}
{% set view = {'schema': schema, 'name': name, 'type': 'view'} %}
{{ log( view , info=true) }}
{% do all_tables_and_views.append(view) %}
{% endfor %}
{% set table_results = run_query('SHOW TABLES IN ' ~ schema) %}
{% set tables_in_schema = table_results.columns[1].values() %}
{% for name in tables_in_schema %}
{% set table = {'schema': schema, 'name': name, 'type': 'table'} %}
{{ log( table , info=true) }}
{% do all_tables_and_views.append(table) %}
{% endfor %}
{% endfor %}
{{ return(all_tables_and_views) }}
{% endmacro %}
Gets a list of all models from the dbt manifest that are materialized as views or tables (incl. incremental):
{% macro get_dbt_nodes() %}
{#
This macro parses the dbt model graph and returns all tables and views
Note:
Only models materialized as view, table or incremental are considered.
Incremental models are treated as tables.
#}
{{ log( 'get_dbt_nodes: Loading views and tables from dbt graph' , info=true) }}
{% set dbt_tables_and_views = [] %}
{% set table_materializations = ["table", "incremental"] %}
{% set all_materializations = ["view", "table", "incremental"] %}
{% for node in graph.nodes.values() %}
{% if node.resource_type == 'model' and node.config.get('materialized', 'none') in all_materializations %}
{# Set the node_type type as either table or view. #}
{% if node.config.get('materialized', 'table') in table_materializations %}
{% set node_type = "table" %}
{% else %}
{% set node_type = "view" %}
{% endif %}
{% set node = {'schema': node.schema, 'name': node.name, 'type': node_type} %}
{{ log( node , info=true) }}
{% do dbt_tables_and_views.append(node) %}
{% endif %}
{% endfor %}
{{ return(dbt_tables_and_views) }}
{% endmacro %}
Get a list of all schemas that are referenced by dbt sources (we want to skip cleanup of those):
{% macro get_dbt_sources_schemas() %}
{#
This macro returns a list of all schema names contained within the dbt model graph
#}
{% set dbt_sources_schemas = [] %}
{% for node in graph.sources.values() %}
{% if node.schema is not none %}
{% if node.schema not in dbt_sources_schemas %}
{% set msg ='Found source schema: ' + node.schema %}
{{ log( msg , info=true) }}
{% do dbt_sources_schemas.append(node.schema.lower()) %}
{% endif %}
{% endif %}
{% endfor %}
{{ return(dbt_sources_schemas) }}
{% endmacro %}
Delete all database tables and views not referenced by any dbt model (objects referenced by dbt sources are skipped):
{% macro drop_unused_tables_and_views(dry_run=true) %}
{#
This macro will drop all tables and views from the database that is not referenced by dbt.
Note: objects belonging to schemas containing dbt sources will remain untouched by this operation.
#}
{{ log( 'Dropping unknown tables and views. dry_run: ' ~ dry_run , info=true) }}
{% set all_tables_and_views = get_all_tables_and_views() %}
{% set dbt_tables_and_views = get_dbt_nodes() %}
{% set ns = namespace(existsInDbt=false, query='') %}
{% set dbt_sources_schemas = get_dbt_sources_schemas() %}
{{ log( 'Starting to drop non-dbt tables and views. dry_run: ' ~ dry_run , info=true) }}
{% for relation in all_tables_and_views %}
{% if relation.schema not in dbt_sources_schemas %}
{% set ns.existsInDbt = false %}
{% for node in dbt_tables_and_views %}
{% if relation.schema == node.schema and relation.name == node.name%}
{% set ns.existsInDbt = true %}
{% endif %}
{% endfor %}
{# Only drop objects that does not exist in dbt #}
{% if not ns.existsInDbt %}
{% if relation.type == 'view' %}
{% set ns.query = 'DROP VIEW IF EXISTS ' + relation.schema + '.' + relation.name %}
{% else %}
{% set ns.query = 'DROP TABLE IF EXISTS ' + relation.schema + '.' + relation.name %}
{% endif %}
{{ log( ns.query , info=true) }}
{% if not dry_run %}
{% do run_query(ns.query) %}
{% endif %}
{% endif %}
{% endif %}
{% endfor %}
{% endmacro %}
Delete all database schemas that no longer contains any tables/views:
{% macro drop_empty_schemas(dry_run=true) %}
{#
This macro checks all current database schemas and drops those that does not
contain any views or tables
#}
{{ log( 'drop_empty_schemas: Dropping all schemas from database that contains no table/views' , info=true) }}
{% set schema_results = run_query('SHOW DATABASES') %}
{% set all_schemas = schema_results.columns[0].values() %}
{% set all_tables_and_views = [] %}
{% for schema in all_schemas %}
{# Cant drop default DB! #}
{% if schema != "default" %}
{% set view_results = run_query('SHOW VIEWS IN ' ~ schema) %}
{% set table_results = run_query('SHOW TABLES IN ' ~ schema) %}
{# Only drop if schema is empty #}
{% if view_results|length == 0 and table_results|length == 0 %}
{% set query = 'DROP DATABASE ' + schema %}
{{ log( query , info=true) }}
{% if not dry_run %}
{% do run_query(query) %}
{% endif %}
{% endif %}
{% endif %}
{% endfor %}
{% endmacro %}