from __future__ import print_function import base64 import os.path from bs4 import BeautifulSoup from googleapiclient.discovery import build from google_auth_oauthlib.flow import InstalledAppFlow from google.auth.transport.requests import Request from google.oauth2.credentials import Credentials # If modifying these scopes, delete the file token.json. SCOPES = ['https://www.googleapis.com/auth/gmail.readonly'] def main(): """Shows basic usage of the Gmail API. Lists the user's Gmail labels. """ creds = None # The file token.json stores the user's access and refresh tokens, and is # created automatically when the authorization flow completes for the first # time. if os.path.exists('token.json'): creds = Credentials.from_authorized_user_file('token.json', SCOPES) # If there are no (valid) credentials available, let the user log in. if not creds or not creds.valid: if creds and creds.expired and creds.refresh_token: creds.refresh(Request()) else: flow = InstalledAppFlow.from_client_secrets_file( 'app_credentail.json', SCOPES) creds = flow.run_local_server(port=0) # Save the credentials for the next run with open('token.json', 'w') as token: token.write(creds.to_json()) service = build('gmail', 'v1', credentials=creds) results = service.users().messages().list(userId='me').execute() mails = results.get('messages') for mail in mails: txt = service.users().messages().get(userId='me', id=mail['id']).execute() try: # Get value of 'payload' from dictionary 'txt' payload = txt['payload'] headers = payload['headers'] # Look for Subject and Sender Email in the headers for d in headers: if d['name'] == 'Subject': subject = d['value'] if d['name'] == 'From': sender = d['value'] if (payload.get('parts') is None) or ('data' not in payload.get('parts')['body']): if payload.get('parts') is None: body = "payload.get('parts') is None" else: body = "data key does not exist" else: # The Body of the message is in Encrypted format. So, we have to decode it. 
# Get the data and decode it with base 64 decoder. parts = payload.get('parts') data = parts['body']['data'] data = data.replace("-", "+").replace("_", "/") decoded_data = base64.b64decode(data) # Now, the data obtained is in lxml. So, we will parse # it with BeautifulSoup library soup = BeautifulSoup(decoded_data, "lxml") body = soup.body() # Printing the subject, sender's email and message print("Subject: ", subject) print("From: ", sender) print("Message: ", body) print('n') except Exception as e: print(e) if __name__ == '__main__': print(main())
This is my Python code to read Gmail message bodies. There is a local variable called "payload", which is service.users().messages().get(userId='me', id=mail['id']).execute()['payload']. This payload variable is supposed to be a dictionary that stores the encoded email bodies. However, sometimes it does not have an email body even though it has the correct headers. This is the case when "('data' not in payload.get('parts')['body'])" is true.
Also, sometimes the email body is stored in a different way. Usually, the body exists under the dictionary 'payload['parts']' with two different MIME types, one of which is plain text. Yet, sometimes the 'parts' key disappears, and the body is stored directly in the 'payload' dictionary. Below is a picture of the payload dictionary for each case:
Usual Case, when both (payload.get(‘parts’) is None) and (‘data’ not in payload.get(‘parts’)[‘body’]) are false:
The case when (payload.get(‘parts’) is None):
The case when (‘data’ not in payload.get(‘parts’)[‘body’]):
I hope to fully understand what is going on, but if it is not possible, I really hope that I can solve at least the last case because it is the only case that has the missing bodies.
Thank you in advance!
Source: Python Questions