Python multiprocessing ending with zombies

  multiprocessing, multithreading, performance, python

36,000 targets have to be processed on an Ubuntu machine. After 13-14 hours of computation and 5,814 targets, the number of processes (initially 120) drops and the remaining processes become zombies.
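For context, my understanding is that a multiprocessing child that has exited stays in the process table as a zombie until the parent reaps it, e.g. with join() or by polling it. A minimal, self-contained sketch (not my production code) that shows this:

import os
import time
from multiprocessing import Process

def worker():
    pass  # exits immediately

if __name__ == "__main__":
    children = [Process(target=worker) for _ in range(5)]
    for child in children:
        child.start()

    print("parent pid:", os.getpid())
    time.sleep(30)  # children have exited but are not joined yet:
                    # "ps --ppid <parent pid> -o pid,stat" lists them in state Z (zombie)

    for child in children:
        child.join()  # reaping: the Z entries disappear once the parent joins them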

I implemented multiprocessing like this:

from multiprocessing import Process
import gc
import traceback

from scrapy.crawler import CrawlerProcess

from scrapy.settings import Settings
crawler_settings = Settings()
crawler_settings.setmodule(my_settings)  # my_settings: the project's Scrapy settings module, imported elsewhere

from scrapy.spiders.sales import SalesSpider

def format_target(seller):
    return f"xxxxxxxxxxx{seller}xxxxxxxxxxxxxx"

def launch_crawler(crawler, seller):
    ## Run a single crawl in its own CrawlerProcess (called once per child process)
    try:
        formated_seller = format_target(seller[1])
        if formated_seller:
            process = CrawlerProcess(crawler_settings)
            process.crawl(crawler, seller[0], formated_seller, seller[2])
            process.start(stop_after_crawl=True)
            del formated_seller
            del process
    except Exception:
        print(traceback.format_exc())

def process_x(urls_lst, process_nb):
    ## Keep up to process_nb crawler processes running until urls_lst is exhausted

    list_process = [None] * process_nb
    while urls_lst:
        for i in range(process_nb):
            if not (list_process[i] and list_process[i].is_alive()):
                list_process[i] = Process(target=launch_crawler, args=(SalesSpider, urls_lst.pop(0)))
                list_process[i].start()
                gc.collect()
                break

    ## Wait for all processes to end
    for process in list_process:
        if process:
            process.join()
            gc.collect()

## MAIN
sellers = [...] ## 36k objects
process_x(sellers,120)

This is the first time this has happened with this implementation. I have already run it over multiple days before, and it completed everything without any problem.

How can I prevent this?
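Would a Pool-based scheduler along these lines avoid it? This is only a rough, untested sketch of the same scheduling done with multiprocessing.Pool, so that finished children are reaped by the pool itself; maxtasksperchild=1 is there because the Twisted reactor used by CrawlerProcess cannot be started twice in the same process:

from multiprocessing import Pool

def process_x_pool(urls_lst, process_nb):
    ## The pool keeps process_nb workers busy and reaps every finished child itself.
    ## maxtasksperchild=1 gives each crawl a fresh process, since CrawlerProcess
    ## (a Twisted reactor) cannot be restarted within the same process.
    with Pool(processes=process_nb, maxtasksperchild=1) as pool:
        pool.starmap(launch_crawler, ((SalesSpider, seller) for seller in urls_lst))

## process_x_pool(sellers, 120)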

Source: Python Questions
