Visualize Salesforce Data Completeness by Field

This page shows how you can use the simple-salesforce library, the Pandas Python data manipulation library, and the Matplotlob Python data vizualization library. The resulting visual is shown below.
png
To access Salesforce orgs with Python, use the python module simple-salesfoce: documentation, GitHub, library page

Install with: pip install simple_salesforce

Salesforce credentials are also required:

  • Email
  • Password
  • Security Token

Security tokens are accessible from within the Salesforce org. Go to Setting » My Personal Information » Reset my Security Token.

from simple_salesforce import Salesforce
import pandas as pd
import numpy as np
from datetime import date
import matplotlib.pyplot as plt
pd.options.display.max_rows = 999

Import Credentials for Salesforce Org

import os #use os to navigate to the directory where your config file is stored
cwd = os.getcwd()
os.chdir('/home/ryan/Documents/')
import config #import the config file
sf = Salesforce(username=config.username,
                password=config.password,
                security_token=config.security_token)
os.chdir(cwd)

List All Objects

Queries for Salesforce Objects are written as Salesforce Object Query Language (SOQL).

desc = sf.describe()
objects = []
obj_labels = [obj['label']  for obj in desc['sobjects']]
obj_names  = [obj['name']   for obj in desc['sobjects']]
obj_custom = [obj['custom'] for obj in desc['sobjects']]
for label, name, custom in zip(obj_labels, obj_names, obj_custom):
    objects.append((label, name, custom))
objects = pd.DataFrame(objects,
                       columns = ['label','name','custom'])
print(str('Objects in Org: ' + str(objects.shape[0])))
objects.head(10)
Objects in Org: 451

label name custom
0 Accepted Event Relation AcceptedEventRelation False
1 Account Account False
2 Account Change Event AccountChangeEvent False
3 Account Clean Info AccountCleanInfo False
4 Account Contact Role AccountContactRole False
5 Account Feed AccountFeed False
6 Account History AccountHistory False
7 Account Partner AccountPartner False
8 Account Share AccountShare False
9 Action Link Group Template ActionLinkGroupTemplate False

List All Fields

Example below is for Contact.

desc = sf.Contact.describe()
fields = []
field_labels = [field['label'] for field in desc['fields']]
field_names =  [field['name']  for field in desc['fields']]
field_types =  [field['type']  for field in desc['fields']]
for label, name, dtype in zip(field_labels, field_names, field_types):
    fields.append((label, name, dtype))
fields = pd.DataFrame(fields,
                      columns = ['label','name','type'])
obj = 'Contact'
field_list = ', '.join(list(fields['name']))
query = 'SELECT ' + field_list + '''
FROM ''' + obj
q = sf.query_all(query)
d = (pd.DataFrame(q['records'])
     .drop(columns=['attributes']))
print('Records, Fields: ' + str(d.shape))
pd.DataFrame(d.iloc[0].T)
Records, Fields: (20, 61)

0
Id 0034w00004CLCVKAA5
IsDeleted False
MasterRecordId None
AccountId 0014w00003gxq9xAAA
LastName Bond
FirstName John
Salutation Mr.
Name John Bond
OtherStreet None
OtherCity None
OtherState None
OtherPostalCode None
OtherCountry None
OtherLatitude None
OtherLongitude None
OtherGeocodeAccuracy None
OtherAddress None
MailingStreet 2334 N. Michigan Avenue, Suite 1500\nChicago, ...
MailingCity None
MailingState None
MailingPostalCode None
MailingCountry None
MailingLatitude None
MailingLongitude None
MailingGeocodeAccuracy None
MailingAddress {'city': None, 'country': None, 'geocodeAccura...
Phone (312) 596-1000
Fax (312) 596-1500
MobilePhone (312) 596-1563
HomePhone None
OtherPhone None
AssistantPhone None
ReportsToId None
Email [email protected]
Title VP, Facilities
Department Facilities
AssistantName None
LeadSource External Referral
Birthdate 1958-08-02
Description None
OwnerId 0054w00000BrZJ6AAN
CreatedDate 2022-10-14T11:08:03.000+0000
CreatedById 0054w00000BrZJ6AAN
LastModifiedDate 2022-10-14T11:08:03.000+0000
LastModifiedById 0054w00000BrZJ6AAN
SystemModstamp 2022-10-14T11:08:03.000+0000
LastActivityDate None
LastCURequestDate None
LastCUUpdateDate None
LastViewedDate None
LastReferencedDate None
EmailBouncedReason None
EmailBouncedDate None
IsEmailBounced False
PhotoUrl /services/images/photo/0034w00004CLCVKAA5
Jigsaw None
JigsawContactId None
CleanStatus Pending
IndividualId None
Level__c Primary
Languages__c English

Calculate Completeness Ratio for Each Field

completeness = (pd.DataFrame(d.isna().sum())
                .rename(columns={0:'complete'}))
completeness['complete'] = d.shape[0] - completeness['complete']
completeness['total'] = d.shape[0]
completeness['ratio'] = completeness['complete'].astype(float) / completeness['total'].astype(float)
completeness = completeness.reset_index().rename(columns={'index':'label'})
completeness = completeness[['label','ratio']].copy()
completeness.head()

label ratio
0 Id 1.0
1 IsDeleted 1.0
2 MasterRecordId 0.0
3 AccountId 1.0
4 LastName 1.0
fields['field'] = fields['label'] + ', ' + fields['name'] + ', ' + fields['type']
m = pd.merge(fields, completeness,
             left_on=['name'],
             right_on=['label'],
             how='left',
             indicator=True)
print(m._merge.value_counts())
m = m[m['_merge']=='both'].reset_index(drop=True)
m = (m
     .drop(columns=['label_x','label_y','name','type','_merge']))
m = (m
     .reset_index()
     .sort_values('index',  # Invert the default sort
                  ascending=False)
     .reset_index(drop=True)
     .drop(columns=['index']))
m.head()
both          61
left_only      0
right_only     0
Name: _merge, dtype: int64

field ratio
0 Languages, Languages__c, string 0.85
1 Level, Level__c, picklist 0.85
2 Individual ID, IndividualId, reference 0.00
3 Clean Status, CleanStatus, picklist 1.00
4 Jigsaw Contact ID, JigsawContactId, string 0.00

Visualize

plt.figure(figsize=(8,(d.shape[1]*1/2)), facecolor='white')
plt.barh(m['field'],
         m['ratio'],
         zorder=3)
plt.title('Data Completeness for ' + obj)
plt.ylim([-1,d.shape[1]])
plt.tick_params(
    axis='y',
    left=False)
plt.tick_params(
    axis='x',
    bottom=False)
plt.grid(True,
         axis='x',
         zorder=0)
for spine in plt.gca().spines.values():
    spine.set_visible(False)

png