Questions about payloads when reading mail bodies with Gmail API (Python)

  beautifulsoup, email, gmail, gmail-api, python
from __future__ import print_function
import base64
import os.path
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials

# If modifying these scopes, delete the file token.json.
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']


def main():
    """Shows basic usage of the Gmail API.
    Lists the user's Gmail labels.
    """
    creds = None
    # The file token.json stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'app_credentail.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.json', 'w') as token:
            token.write(creds.to_json())

    service = build('gmail', 'v1', credentials=creds)

    results = service.users().messages().list(userId='me').execute()
    mails = results.get('messages')
    for mail in mails:
        txt = service.users().messages().get(userId='me', id=mail['id']).execute()
        try:
            # Get value of 'payload' from dictionary 'txt'
            payload = txt['payload']
            headers = payload['headers']

            # Look for Subject and Sender Email in the headers
            for d in headers:
                if d['name'] == 'Subject':
                    subject = d['value']
                if d['name'] == 'From':
                    sender = d['value']


            if (payload.get('parts') is None) or ('data' not in payload.get('parts')[0]['body']):
                if payload.get('parts') is None:
                    body = "payload.get('parts') is None"
                else:
                    body = "data key does not exist"
            else:
                # The Body of the message is in Encrypted format. So, we have to decode it.
                # Get the data and decode it with base 64 decoder.
                parts = payload.get('parts')[0]
                data = parts['body']['data']
                data = data.replace("-", "+").replace("_", "/")
                decoded_data = base64.b64decode(data)

                # Now, the data obtained is in lxml. So, we will parse
                # it with BeautifulSoup library
                soup = BeautifulSoup(decoded_data, "lxml")
                body = soup.body()

            # Printing the subject, sender's email and message
            print("Subject: ", subject)
            print("From: ", sender)
            print("Message: ", body)
            print('n')
        except Exception as e:
            print(e)
            

if __name__ == '__main__':
    print(main())

This is my python code to read Gmail bodies. There is a local variable called "payload", which is service.users().messages().get(userId=’me’, id=mail[‘id’]).execute()[‘payload’]. This payload variable is supposed to be a dictionary that stores the encoded email bodies. However, sometimes it does not have an email body even though it has the correct headers. This is the case when "(‘data’ not in payload.get(‘parts’)[0][‘body’])" is true.
Also, sometimes the email body is stored in a different way. Usually, the body exists under the dictionary ‘payload[‘parts’]’ with two different mime-type, one of which is plain-text. Yet, sometimes the ‘parts’ key disappears, and the body is stored. directly in ‘payload’ dictionary. Below is the picture of the payload dictionary for each case:

Usual Case, when both (payload.get(‘parts’) is None) and (‘data’ not in payload.get(‘parts’)[0][‘body’]) are false:

payload variable when usual case

The case when (payload.get(‘parts’) is None):

payload variable when payload.get(‘parts’) is None

The case when (‘data’ not in payload.get(‘parts’)[0][‘body’]):

payload variable when ‘data’ not in payload.get(‘parts’)[0][‘body’]

I hope to fully understand what is going on, but if it is not possible, I really hope that I can solve at least the last case because it is the only case that has the missing bodies.

Thank you in advance!

Source: Python Questions

LEAVE A COMMENT