Visualize Salesforce Data Completeness by Field
This page shows how you can use the simple-salesforce
library, the Pandas
Python data manipulation library, and the Matplotlob
Python data vizualization library. The resulting visual is shown below.
To access Salesforce orgs with Python, use the python module simple-salesfoce
: documentation, GitHub, library page
Install with: pip install simple_salesforce
Salesforce credentials are also required:
- Password
- Security Token
Security tokens are accessible from within the Salesforce org. Go to Setting » My Personal Information » Reset my Security Token.
from simple_salesforce import Salesforce
import pandas as pd
import numpy as np
from datetime import date
import matplotlib.pyplot as plt
pd.options.display.max_rows = 999
Import Credentials for Salesforce Org
import os #use os to navigate to the directory where your config file is stored
cwd = os.getcwd()
os.chdir('/home/ryan/Documents/')
import config #import the config file
sf = Salesforce(username=config.username,
password=config.password,
security_token=config.security_token)
os.chdir(cwd)
List All Objects
Queries for Salesforce Objects are written as Salesforce Object Query Language (SOQL).
desc = sf.describe()
objects = []
obj_labels = [obj['label'] for obj in desc['sobjects']]
obj_names = [obj['name'] for obj in desc['sobjects']]
obj_custom = [obj['custom'] for obj in desc['sobjects']]
for label, name, custom in zip(obj_labels, obj_names, obj_custom):
objects.append((label, name, custom))
objects = pd.DataFrame(objects,
columns = ['label','name','custom'])
print(str('Objects in Org: ' + str(objects.shape[0])))
objects.head(10)
Objects in Org: 451
label | name | custom | |
---|---|---|---|
0 | Accepted Event Relation | AcceptedEventRelation | False |
1 | Account | Account | False |
2 | Account Change Event | AccountChangeEvent | False |
3 | Account Clean Info | AccountCleanInfo | False |
4 | Account Contact Role | AccountContactRole | False |
5 | Account Feed | AccountFeed | False |
6 | Account History | AccountHistory | False |
7 | Account Partner | AccountPartner | False |
8 | Account Share | AccountShare | False |
9 | Action Link Group Template | ActionLinkGroupTemplate | False |
List All Fields
Example below is for Contact.
desc = sf.Contact.describe()
fields = []
field_labels = [field['label'] for field in desc['fields']]
field_names = [field['name'] for field in desc['fields']]
field_types = [field['type'] for field in desc['fields']]
for label, name, dtype in zip(field_labels, field_names, field_types):
fields.append((label, name, dtype))
fields = pd.DataFrame(fields,
columns = ['label','name','type'])
obj = 'Contact'
field_list = ', '.join(list(fields['name']))
query = 'SELECT ' + field_list + '''
FROM ''' + obj
q = sf.query_all(query)
d = (pd.DataFrame(q['records'])
.drop(columns=['attributes']))
print('Records, Fields: ' + str(d.shape))
pd.DataFrame(d.iloc[0].T)
Records, Fields: (20, 61)
0 | |
---|---|
Id | 0034w00004CLCVKAA5 |
IsDeleted | False |
MasterRecordId | None |
AccountId | 0014w00003gxq9xAAA |
LastName | Bond |
FirstName | John |
Salutation | Mr. |
Name | John Bond |
OtherStreet | None |
OtherCity | None |
OtherState | None |
OtherPostalCode | None |
OtherCountry | None |
OtherLatitude | None |
OtherLongitude | None |
OtherGeocodeAccuracy | None |
OtherAddress | None |
MailingStreet | 2334 N. Michigan Avenue, Suite 1500\nChicago, ... |
MailingCity | None |
MailingState | None |
MailingPostalCode | None |
MailingCountry | None |
MailingLatitude | None |
MailingLongitude | None |
MailingGeocodeAccuracy | None |
MailingAddress | {'city': None, 'country': None, 'geocodeAccura... |
Phone | (312) 596-1000 |
Fax | (312) 596-1500 |
MobilePhone | (312) 596-1563 |
HomePhone | None |
OtherPhone | None |
AssistantPhone | None |
ReportsToId | None |
[email protected] | |
Title | VP, Facilities |
Department | Facilities |
AssistantName | None |
LeadSource | External Referral |
Birthdate | 1958-08-02 |
Description | None |
OwnerId | 0054w00000BrZJ6AAN |
CreatedDate | 2022-10-14T11:08:03.000+0000 |
CreatedById | 0054w00000BrZJ6AAN |
LastModifiedDate | 2022-10-14T11:08:03.000+0000 |
LastModifiedById | 0054w00000BrZJ6AAN |
SystemModstamp | 2022-10-14T11:08:03.000+0000 |
LastActivityDate | None |
LastCURequestDate | None |
LastCUUpdateDate | None |
LastViewedDate | None |
LastReferencedDate | None |
EmailBouncedReason | None |
EmailBouncedDate | None |
IsEmailBounced | False |
PhotoUrl | /services/images/photo/0034w00004CLCVKAA5 |
Jigsaw | None |
JigsawContactId | None |
CleanStatus | Pending |
IndividualId | None |
Level__c | Primary |
Languages__c | English |
Calculate Completeness Ratio for Each Field
completeness = (pd.DataFrame(d.isna().sum())
.rename(columns={0:'complete'}))
completeness['complete'] = d.shape[0] - completeness['complete']
completeness['total'] = d.shape[0]
completeness['ratio'] = completeness['complete'].astype(float) / completeness['total'].astype(float)
completeness = completeness.reset_index().rename(columns={'index':'label'})
completeness = completeness[['label','ratio']].copy()
completeness.head()
label | ratio | |
---|---|---|
0 | Id | 1.0 |
1 | IsDeleted | 1.0 |
2 | MasterRecordId | 0.0 |
3 | AccountId | 1.0 |
4 | LastName | 1.0 |
fields['field'] = fields['label'] + ', ' + fields['name'] + ', ' + fields['type']
m = pd.merge(fields, completeness,
left_on=['name'],
right_on=['label'],
how='left',
indicator=True)
print(m._merge.value_counts())
m = m[m['_merge']=='both'].reset_index(drop=True)
m = (m
.drop(columns=['label_x','label_y','name','type','_merge']))
m = (m
.reset_index()
.sort_values('index', # Invert the default sort
ascending=False)
.reset_index(drop=True)
.drop(columns=['index']))
m.head()
both 61
left_only 0
right_only 0
Name: _merge, dtype: int64
field | ratio | |
---|---|---|
0 | Languages, Languages__c, string | 0.85 |
1 | Level, Level__c, picklist | 0.85 |
2 | Individual ID, IndividualId, reference | 0.00 |
3 | Clean Status, CleanStatus, picklist | 1.00 |
4 | Jigsaw Contact ID, JigsawContactId, string | 0.00 |
Visualize
plt.figure(figsize=(8,(d.shape[1]*1/2)), facecolor='white')
plt.barh(m['field'],
m['ratio'],
zorder=3)
plt.title('Data Completeness for ' + obj)
plt.ylim([-1,d.shape[1]])
plt.tick_params(
axis='y',
left=False)
plt.tick_params(
axis='x',
bottom=False)
plt.grid(True,
axis='x',
zorder=0)
for spine in plt.gca().spines.values():
spine.set_visible(False)