Let's assume such a violation is possible. How do we identify it?
We have at our disposal recordings from the surveillance camera at the employee's workplace and a log of operations.
We will look for all the moments in the recording when the client was absent. The MobileNet neural network and the CSRT tracker from the opencv library will help us with this, along with Tesseract-OCR for convenience.
To find a person in the frame, we will use the MobileNet neural network. This network can detect and localize 20 types of objects in an image. For it to work, you need to download two files: the architecture and the weights. These files can be found in the GitHub repository.
Before writing the code, we need to install the opencv computer vision library, the pytesseract package for recognizing text in images, and imutils for convenient frame resizing.
!pip install opencv-python
!pip install pytesseract
!pip install imutils
For pytesseract to work, you must first download the Tesseract-OCR distribution from the official website and install it.
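If you prefer not to edit the PATH variable (as we do below), pytesseract can also be pointed at the binary directly. A minimal sketch; the path is a placeholder for your own installation:

import pytesseract
# Alternative to editing PATH: tell pytesseract where the tesseract binary is.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'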
Getting started: preparing for video processing
We import the packages and add the path to the Tesseract-OCR folder to the PATH environment variable:
import os
video_path = ...  # path to the video file
tesseract_path = ...  # path to the Tesseract-OCR folder
os.environ["PATH"] += os.pathsep + tesseract_path
import pytesseract
import cv2
import imutils
import pandas as pd
import datetime as dt
Next, we create a dataframe for the results and define the variables we will need: the coordinates of the client's seat, the current date and time in the frame, and a flag showing whether the client is in place:
df = pd.DataFrame(columns = ['time', 'client_present'])
work_place = ()  # coordinates of the client's seat (selected on the first frame)
date = None  # current date and time read from the frame
tracked = False  # flag: is the client currently in place
Next, we specify the files that the neural network needs: the architecture and the weights we downloaded earlier.
prototxt = 'MobileNetSSD_deploy.prototxt'  # network architecture
weights = 'MobileNetSSD_deploy.caffemodel'  # network weights
MobileNet recognizes 20 classes of objects plus the background; here is the dictionary of classes:
classNames = {0: 'background',
1: 'aeroplane',
2: 'bicycle',
3: 'bird',
4: 'boat',
5: 'bottle',
6: 'bus',
7: 'car',
8: 'cat',
9: 'chair',
10: 'cow',
11: 'diningtable',
12: 'dog',
13: 'horse',
14: 'motorbike',
15: 'person',
16: 'pottedplant',
17: 'sheep',
18: 'sofa',
19: 'train',
20: 'tvmonitor'}
We also set a confidence threshold: detections with a lower confidence will be ignored.
thr = 0.1  # confidence threshold
Now we load the network:
net = cv2.dnn.readNetFromCaffe(prototxt, weights)  # load the pretrained model
The video is read with cv2.VideoCapture; let's open our file:
cap = cv2.VideoCapture(video_path)
Frames are retrieved one by one with the .read() method, which also reports whether the read succeeded. To convert frame numbers into time, we need the frame rate (fps). If it is not known in advance, we can count the total number of frames and divide by the video duration:
%%time
cap = cv2.VideoCapture(video_path)
total_frame = 0
while True:
    success, frame = cap.read()
    if success:
        total_frame += 1
    else:
        break
video_length = ...  # video duration in seconds
fps = round(total_frame / video_length)
fps
Counting frames this way takes a while: for a 100-minute recording it took about 2 minutes.
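If the duration is not known either, both numbers can usually be read straight from the file's metadata, which is instant. A sketch of this shortcut; it relies on the file's headers being correct, which is not always the case:

# Faster alternative: take fps and the frame count from the video metadata.
cap = cv2.VideoCapture(video_path)
fps_meta = cap.get(cv2.CAP_PROP_FPS)             # declared frames per second
frames_meta = cap.get(cv2.CAP_PROP_FRAME_COUNT)  # declared number of frames
print(round(fps_meta), int(frames_meta))
cap.release()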
Now everything is ready for the main loop. On the first frame we will mark the client's seat and the area with the timestamp; then, frame by frame, we will find the person with the detector, follow them with the tracker, and log whether the client is in place.
while cap.isOpened():
    ret, frame = cap.read()
    if ret:
        frame = imutils.resize(frame, width=1200)  # resize the frame for display
        # on the first frame, ask the user to mark the client's seat
        if len(work_place) == 0:
            cv2.putText(frame, 'Set the client\'s location', (0, 90), cv2.FONT_HERSHEY_SIMPLEX,
                        2, (0,255,0), 2)
            work_place = cv2.selectROI('frame', frame, fromCenter=False, showCrosshair=True)
            x, y, w, h = [int(coord) for coord in work_place]
        # on the first frame, ask the user to mark the timestamp and recognize it
        if not date:
            try:
                cv2.putText(frame, 'Set the date', (0, 160), cv2.FONT_HERSHEY_SIMPLEX,
                            2, (0,255,0), 2)
                date = cv2.selectROI('frame', frame, fromCenter=False, showCrosshair=True)
                date_x, date_y, date_w, date_h = [int(coord) for coord in date]
                date_ = frame[date_y : date_y+date_h, date_x : date_x+date_w]
                date_ = cv2.cvtColor(date_, cv2.COLOR_BGR2GRAY)  # to grayscale
                #date_ = cv2.threshold(date_, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
                date_ = cv2.threshold(date_, 180, 255, 0)[1]  # binarize
                date = pytesseract.image_to_string(date_).strip()
                date = dt.datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
            except Exception:
                print('Could not recognize the date, enter it in the format YYYY-MM-DD HH:MM:SS')
                date_ = input()
                date = dt.datetime.strptime(date_, '%Y-%m-%d %H:%M:%S')
        # cap.get(1) is the current frame number: every fps frames add one second
        if cap.get(1) % fps == 0:
            date += dt.timedelta(seconds = 1)
        # run the detector if nobody is tracked, and additionally every 30 seconds
        if not tracked or (cap.get(1) % (fps * 30) == 0):
            # prepare the frame for the network
            frame_resized = cv2.resize(frame, (300, 300))  # the network expects 300 x 300 input
            blob = cv2.dnn.blobFromImage(frame_resized, 0.007843,
                                         (300,300), (127.5, 127.5, 127.5), False)
            # feed the blob to the network
            net.setInput(blob)
            detections = net.forward()
            #[0, 0, object, [0, class_id, confidence, xLeftBottom, yLeftBottom, xRightTop, yRightTop]]
            cols = frame_resized.shape[1]
            rows = frame_resized.shape[0]
            # iterate over the detected objects
            for obj in detections[0, 0, :, :]:
                confidence = obj[2]
                if confidence > thr:
                    class_id = int(obj[1])
                    if class_id == 15:  # class 15 is 'person'
                        xLeftBottom = int(obj[3] * cols)
                        yLeftBottom = int(obj[4] * rows)
                        xRightTop = int(obj[5] * cols)
                        yRightTop = int(obj[6] * rows)
                        # scale factors between the 300 x 300 input and the full frame
                        heightFactor = frame.shape[0] / 300.0
                        widthFactor = frame.shape[1] / 300.0
                        # box coordinates in the full frame
                        xLeftBottom = int(widthFactor * xLeftBottom)
                        yLeftBottom = int(heightFactor * yLeftBottom)
                        xRightTop = int(widthFactor * xRightTop)
                        yRightTop = int(heightFactor * yRightTop)
                        # center of the bounding box
                        xCenter = xLeftBottom + (xRightTop - xLeftBottom)/2
                        yCenter = yLeftBottom + (yRightTop - yLeftBottom)/2
                        # if the center falls inside the client's seat, start tracking
                        if xCenter < x + w and yCenter < y + h and xCenter > x and yCenter > y:
                            tracker = cv2.TrackerCSRT_create()
                            tracker.init(frame, (xLeftBottom, yLeftBottom, xRightTop-xLeftBottom, yRightTop-yLeftBottom))
                            tracked = True
                            cv2.rectangle(frame, (xLeftBottom,yLeftBottom), (xRightTop,yRightTop), (0,255,0), 3, 1)
                            break
                        else:
                            tracked = False
        else:
            # the person was found earlier: just update the tracker
            _, bbox = tracker.update(frame)
            X, Y, W, H = [int(coord) for coord in bbox]
            xCenter = X + W/2
            yCenter = Y + H/2
            if xCenter < x + w and yCenter < y + h and xCenter > x and yCenter > y:
                tracked = True
                cv2.rectangle(frame, (X,Y), (X + W, Y + H), (255,255,0), 3, 1)
            else:
                tracked = False
        cv2.imshow('frame', frame)
        df.loc[cap.get(1), :] = [date, tracked]
        print(cap.get(1), date, tracked)  # progress: frame number, time, client present
        if cv2.waitKey(1) == 27:  # ESC stops the processing
            break
    else:
        break
cap.release()
cv2.destroyAllWindows()
The .read() method returns two values: a flag showing whether the frame was read successfully, and the frame itself. When the frames run out, the flag becomes False and the loop stops.
On the first frame the user selects the area with the timestamp, and we try to recognize it in the format «YYYY-MM-DD HH:MM:SS». If recognition fails, the program asks for the date to be entered manually: looking at the frame on the screen, the user types it in the same format. The result is stored in the date variable.
Before recognition, the crop with the timestamp is converted to grayscale and binarized: Tesseract-OCR copes much better with such preprocessed images. The .get() method with argument 1 returns the number of the current frame, so every fps frames we add one second to date. Thanks to this, tesseract is called only once, at the very beginning, and not on every frame, which would be far too slow.
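Recognition of a single line like a timestamp can often be made more reliable by telling Tesseract what to expect. A small sketch, assuming the preprocessed crop is in date_:

# Hint Tesseract that the crop is a single line of text (page segmentation
# mode 7); this often improves recognition of timestamps.
date_text = pytesseract.image_to_string(date_, config='--psm 7').strip()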
Detection works like this: the frame is shrunk to 300 x 300 with cv2.resize() and turned into a blob with cv2.dnn.blobFromImage(); the scale factor 0.007843 (about 1/127.5) and the mean 127.5 map pixel values into the range the network was trained on. The network returns the detections array, where each row holds a class id, a confidence and the normalized coordinates of a bounding box. Out of the 20 classes we are interested only in 'person', which has id 15. If the center of a detected person falls inside the selected seat, we initialize the CSRT tracker on that box and set tracked to True. On every frame, the current date and the tracked flag are appended to the df dataframe.
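To make the indexing in detections concrete, here is how one row unpacks (a sketch; the coordinates are fractions of the 300 x 300 input):

# Each detection row: [_, class_id, confidence, x1, y1, x2, y2]
obj = detections[0, 0, 0, :]               # first detected object
class_id, confidence = int(obj[1]), float(obj[2])
x1, y1, x2, y2 = obj[3:7]                  # corners, normalized to [0, 1]
print(classNames.get(class_id, '?'), confidence, (x1, y1, x2, y2))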
While the person is being tracked, the detector is not needed: the tracker is simply updated on each frame, which is much cheaper. Depending on whether the center of the tracked box stays inside the seat, tracked takes the value True or False. To guard against tracker drift, detection is additionally re-run every 30 seconds.
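For reference, here is the tracker API in isolation: a minimal sketch that tracks an object selected by hand in an arbitrary video (the file name is a placeholder). Note that, depending on the opencv version, TrackerCSRT_create may live in cv2 or in cv2.legacy:

import cv2

cap = cv2.VideoCapture('test.mp4')       # placeholder video file
ok, frame = cap.read()
bbox = cv2.selectROI('init', frame)      # draw a box around the object
tracker = cv2.TrackerCSRT_create()
tracker.init(frame, bbox)
while True:
    ok, frame = cap.read()
    if not ok:
        break
    success, bbox = tracker.update(frame)    # new position of the object
    if success:
        x, y, w, h = [int(v) for v in bbox]
        cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 255, 0), 2)
    cv2.imshow('tracking', frame)
    if cv2.waitKey(1) == 27:                 # ESC to stop
        break
cap.release()
cv2.destroyAllWindows()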
Processing can be interrupted at any moment by pressing ESC; otherwise it runs until the video ends, after which we release the capture and close the windows.
As a result we have a record for every frame: the time and whether the client was present. Several frames fall within the same second, so we group by time and take the maximum: if the client was in place in at least one frame of a given second, we count them as present. The result is saved to Excel:
df_ = df.groupby('time', as_index=False).max()
df_.to_excel('output.xlsx', index=False)
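Taking the maximum of a boolean column acts as a logical OR within each group, which is exactly what we want here. A toy illustration:

import pandas as pd

toy = pd.DataFrame({'time': ['12:00:01', '12:00:01', '12:00:02'],
                    'client_present': [False, True, False]})
print(toy.groupby('time', as_index=False).max())
#        time  client_present
# 0  12:00:01            True
# 1  12:00:02           False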
Looking at the result, we can point out several weaknesses of this approach:
The first problem: the CSRT tracker from opencv is based on classical computer-vision methods. When the person moves quickly or is partially hidden, the tracker can lose them, and detection has to be repeated.
The second problem: the tracker gradually drifts, and the box can slide off the person onto the background.
The third problem: the network detects a seated person noticeably worse than a standing one, since it was trained mostly on people in full height. The person is "lost" for the detector as soon as they sit down.
The first and second problems can be solved by trackers based on deep learning, for example the GOTURN tracker. It is implemented in the opencv library, but you need to download additional files for it to work. You can also use the popular Re3 tracker or the recently introduced AcurusTrack. The third problem can be solved by replacing the neural network and/or retraining it on seated people.
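A minimal sketch of switching to GOTURN, assuming goturn.prototxt and goturn.caffemodel have already been downloaded into the working directory; the rest of the code stays the same because the tracker API is identical:

import cv2

# GOTURN is a deep-learning tracker; opencv loads its Caffe model from
# goturn.prototxt and goturn.caffemodel in the current directory.
tracker = cv2.TrackerGOTURN_create()
# then, as before: tracker.init(frame, bbox) and tracker.update(frame)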