Identifying root nodes using manifest.json

Hi, this is a follow up post for Is there a way either or show only the most left handed sources/models in the DAG?. I cannot reply to that post anymore as it is closed. However, thanks for the suggestion to use the manifest.json to find out the root node of a complex dbt dag. I went further and wrote the code for the same using python. Wanted to leave a note here so that if someone in need of it, can reference it.

import json
f = open('/Users/Humapathy/Desktop/manifest.json') # <location of the manifest.json file along with the path>
d=json.load(f)

node_to_analyze = 'model.openfit.dim_platform' # <The node in the dbt document dag for which the root notes (with no predecessors) to be found> 

# Fetch all the immediate 1st order parents of node in question.
for i in d['nodes']:
    if i==node_to_analyze:
        j=d['nodes'][i]
        pred_nodes = j['depends_on']['nodes']
print (pred_nodes)
# Initializing variables
pred_nodes1 = []
pred_nodes1_removed = []
root_parent_node =[]
x = True

#Loop through the 1st order parent nodes, if they in turn have first order parents, add them to the loop (keep looping), if they DO not have first order parents, then note them as root parents and remove them from loop.  

while x:
    for k in pred_nodes:
        l = d['nodes'][k]
        if (l['depends_on']['nodes'] ==[]):
            pred_nodes1_removed.append(k)
            if k in pred_nodes1:
                pred_nodes1.remove(k)
            root_parent_node.append(k)
        else:
            if pred_nodes1_removed:
                for m in l['depends_on']['nodes']:
                    if m not in pred_nodes1_removed: 
                        if m not in pred_nodes1:
                            pred_nodes1.append(m)
            else:
                for m in l['depends_on']['nodes']:
                    if m not in pred_nodes1:
                        pred_nodes1.append(m)
            pred_nodes1_removed.append(k)
            if k in pred_nodes1:
                pred_nodes1.remove(k)
    pred_nodes = pred_nodes1
    if pred_nodes1 == []:
        x = False
        break

print('Root Parent Nodes For ', node_to_analyze)
print('\n'.join(str(el) for el in root_parent_node ))
1 Like

Here’s Snowflake SQL to produce a table of model-descendent pairs (including self for ease-of-use).

The harder part is probably getting the manifest’s node_models, child_map, and run_results in Snowflake in the first place.

We join this with Snowflake query history to identify models that are un-unsed in terms of directly accessing queries AND indirectly accessing queries (of downstream models).

{{
  config(
    materialized='table'
  )
}}


with run_nodes as (
  select * from {{ ref('stg_dbt_manifest__nodes_models') }}
)

, run_node_children as (
  select * from {{ ref('stg_dbt_manifest__child_map') }}
)

, nodes as (

  select
      run_nodes.node
    , run_nodes.run_id
    , run_nodes.table_sk
    
  from run_nodes
  
  where run_nodes.run_id = (select max(run_id) from run_nodes)
    and run_nodes.resource_type = 'model'
    and run_nodes.database = 'dbt_prod'
    and run_nodes.schema in  ('core', 'analytics_engineering', 'staging', 'intermediate')
    and run_nodes.run_last_update > current_date - 5

)

, node_children as (

  select 
      run_node_children.node
    , run_node_children.child as descendant
      
  from run_node_children
  -- to apply node filters from above
  inner join nodes as node
    on run_node_children.node = node.node
  inner join nodes as child
    on run_node_children.child = child.node

  where run_node_children.run_id = (select max(run_id) from run_node_children)

)

, node_descendants_recursive (node, descendant, depth, path_array) as (

  select
      node
    , descendant
    , 1 as depth
    , array_construct(node, descendant) as path_array

  from node_children

    union all
  
  select 
      node_descendants_recursive.node
    , node_children.descendant
    , node_descendants_recursive.depth + 1 as depth
    , array_cat(node_descendants_recursive.path_array, to_array(node_children.descendant)) as path_array

  from node_descendants_recursive
  inner join node_children
    on node_descendants_recursive.descendant = node_children.node

  where node_descendants_recursive.depth < 50 -- recursion hard-stop, current DAG depth is ~25

)

, node_descendants_and_self as (

  select 
    node
    , descendant
    , depth
    , path_array

  from node_descendants_recursive
  
  union all 

  select
    node
    , node as descendant
    , 0 as depth
    , to_array(node) as path_array 

  from nodes

)

, node_descendants as (

  select 
      node
    , descendant
    , count(*) as count_paths
    , min(depth) as min_depth
    , max(depth) as max_depth
    , array_agg(path_array) as path_arrays

  from node_descendants_and_self

  group by 1,2

)

, node_descendants_path as (

  select 
      node
    , descendant
    , path_array
    , row_number() over (partition by node, descendant order by depth asc, path_array asc) = 1 as is_shortest_path
    , row_number() over (partition by node, descendant order by depth desc, path_array asc) = 1 as is_longest_path

  from node_descendants_and_self

)

, final as (
  
  select 
      node_descendants.node
    , node_descendants.descendant
    , concat(node_descendants.node, '-', node_descendants.descendant) as node_descendant_sk
    , node.table_sk
    , descendant.table_sk as descendant_table_sk
    , concat(node.table_sk, '-', descendant.table_sk) as node_descendant_table_sk

    , node_descendants.min_depth
    , node_descendants.max_depth
    , node_descendants.count_paths
    , node_descendants.path_arrays
    , shortest_path.path_array as shortest_path_array
    , longest_path.path_array as longest_path_array

  from node_descendants
  inner join nodes as node
    on node_descendants.node = node.node
  inner join nodes as descendant
    on node_descendants.descendant = descendant.node
  inner join node_descendants_path as shortest_path
    on node_descendants.node = shortest_path.node
    and node_descendants.descendant = shortest_path.descendant
    and shortest_path.is_shortest_path
  inner join node_descendants_path as longest_path
    on node_descenda
1 Like