Merge branch 'E02-US02-extracao-chevrolet' into 'develop'

[E02-US02] Task 11 - Extração Tabela Modelos See merge request veiculos-via-montadora/backend!6

Merge branch 'E02-US02-extracao-chevrolet' into 'develop'
[E02-US02] Task 11 - Extração Tabela Modelos See merge request veiculos-via-montadora/backend!6
55046fd4 · Arthur Sudbrack Ibarra · 71737177 · 9d4f6a9f · 55046fd4 · 55046fd4
Commit 55046fd4 authored 1 year ago by Arthur Sudbrack Ibarra
Hide whitespace changes
Inline Side-by-side

Showing

with 512 additions and 18 deletions
+512 -18
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -23,7 +23,6 @@ variables:
  # DB_ENVIRONMENT is set to test to run the tests in the pipeline.
  DB_ENVIRONMENT: test

-
 # Stages.
 # The stages are used to organize the jobs.
 # They run in the order they are defined.
@@ -42,6 +41,9 @@ test:
  stage: test
  image: python:3.10.11-bullseye
  script:
+    # Install Java 11 (Needed to use Tabula-Py).
+    - echo "[Installing Java 11 to use Tabula-Py]"
+    - apt update && apt install -y openjdk-11-jre openjdk-11-jdk
    # Install Dependencies.
    - echo "[Installing Poetry]"
    - pip install "poetry==1.4.1"

--- a/Dockerfile
+++ b/Dockerfile
-# Using the official Python image.
-# Python 3.8.0.
+# Using the official Python 3.10.11 image for Debian Bullseye.
 FROM python:3.10.11-bullseye

 # Arguments for the build.
@@ -7,7 +6,8 @@ FROM python:3.10.11-bullseye
 # ENVIRONMENT: development, production.
 # Default: development.
 #
-ARG ENVIRONMENT=development
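+# MONGODB_HOST: MongoDB connection string (no longer set in this Dockerfile; provided through the container environment, e.g. docker-compose.yaml).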
+ARG ENVIRONMENT="development"

 # Environment variables.
 ENV ENVIRONMENT=${ENVIRONMENT} \
@@ -17,12 +17,14 @@ ENV ENVIRONMENT=${ENVIRONMENT} \
    PIP_NO_CACHE_DIR=off \
    PIP_DISABLE_PIP_VERSION_CHECK=on \
    PIP_DEFAULT_TIMEOUT=100 \
-    POETRY_VERSION=1.4.1 \
-    MONGODB_HOST=<SECRET>
+    POETRY_VERSION=1.4.1

 # Set the working directory.
 WORKDIR /app

+# Install Java 11 (Needed to use Tabula-Py).
+RUN apt update && apt install -y openjdk-11-jre openjdk-11-jdk
+
 # Install Poetry.
 RUN pip install "poetry==$POETRY_VERSION"


--- a/app/api/models/veiculo.py
+++ b/app/api/models/veiculo.py
@@ -33,4 +33,6 @@ class Veiculo(BaseModel):
    num_portas: str = ""
    num_renavam: str = ""
    status: str = "PENDENTE"
+    producao: str = ""
+    desc_vendas: str = ""
    pdf_names: List[str] = []
--- a/app/api/routers/veiculo_router.py
+++ b/app/api/routers/veiculo_router.py
-from fastapi import APIRouter, File, UploadFile
+from fastapi import APIRouter, File, Form, UploadFile
 from app.api.models.veiculo import Veiculo
 from app.database.mongo import get_database
 from app.api.repositories.veiculo_repository import VeiculoRepository
@@ -7,7 +7,8 @@ from app.api.services.veiculo_service import VeiculoService
 # Car router.
 #
 # Here we define the routes for the car resource.
-# We call the CarController to handle the requests.
+# This router also acts as the controller for the car resource.
+# It receives the requests, calls the service and returns the response.

 # These variables start with an underscore to indicate that they are 'private'.
 # They are not meant to be used outside of this file.
@@ -17,6 +18,8 @@ _veiculo_service = VeiculoService(_repository)
 _veiculo_router = APIRouter(prefix="/veiculos")


+## Routes - START ##
+
 @_veiculo_router.get("/")
 def get_veiculos() -> list[Veiculo]:
    return _veiculo_service.get_all()
@@ -42,17 +45,14 @@ def delete_veiculo(sigla: str) -> str:
    return _veiculo_service.delete(sigla)


-# It contains a single endpoint that receives the PDF file.
 @_veiculo_router.post("/upload/pdf")
-def create_upload_file(form_data: UploadFile = File(...)):
-    contents = form_data.file.read()  # This function reads the pdf bytes.
-    # Here we have the pdf bytes saved in the application memory.
-    # The ideia is to call a funtion which will handle the pdf bytes and extract them.
+def create_veiculo_by_pdf(file: UploadFile = File(...), montadora: str = Form(...)):
+    pdf_bytes = file.file.read()
+    file_name = file.filename
+    return _veiculo_service.create_by_pdf(file_name, pdf_bytes, montadora)

-    # This is the file name in memory. It will be used to save the veiculo JSON in the database.
-    name = form_data.filename

-    return {"filename": name}
+## Routes - END ##


 # This function is used to get the car router.

--- a/app/api/services/veiculo_service.py
+++ b/app/api/services/veiculo_service.py
+from io import BytesIO
 from fastapi import HTTPException
 from app.api.repositories.veiculo_repository import VeiculoRepository
 from app.api.models.veiculo import Veiculo
+from app.pdf.chevrolet.ChevroletPDFReader import ChevroletPDFReader


+CHEVROLET_MONTADORA = "chevrolet"
+
 # Service class.
 #
 # This class is responsible for handling the business logic.
 # It will call the repository layer to get the data and return the response.
 # It will also handle any exceptions that are raised by the repository layer.
+
+
 class VeiculoService:
    def __init__(self, repository: VeiculoRepository):
        self._repository = repository
@@ -39,3 +45,92 @@ class VeiculoService:
            raise HTTPException(
                status_code=400, detail="Dado nao encontrado para deletar.")
        return sigla
+
+    # Creates or updates Veiculo records using the data read from a PDF file.
+    # Dispatches on the 'montadora' parameter (case-insensitive); any value other than 'chevrolet' raises HTTP 400.
+    def create_by_pdf(self, file_name: str, pdf_bytes: bytes, montadora: str) -> Veiculo:
+        if str.lower(montadora) == CHEVROLET_MONTADORA:
+            return self._create_by_pdf_chevrolet(file_name, pdf_bytes)
+        else:
+            raise HTTPException(
+                status_code=400, detail="Montadora inválida.")
+
+    # Reads the Introduction tables of a Chevrolet PDF and, for each table line, creates a Veiculo
+    # (or updates it when get_by_sigla finds the 'sigla'); returns the list of resulting vehicles.
+    def _create_by_pdf_chevrolet(self, file_name: str, pdf_bytes: bytes) -> list[Veiculo]:
+        bytes_io = BytesIO(pdf_bytes)
+
+        # PDFs from 2023 tend to only work with the lattice mode on,
+        # while PDFs from 2022 tend to only work with the lattice mode off.
+        # This is not a strict rule, only what has been observed MOST of the time.
+        # Guessing the mode from the file name is fragile, but no better approach has been found:
+        # each PDF has its own layout, with no common pattern and no way to know it beforehand.
+        #
+        # From tabula-py documentation:
+        #
+        # "lattice: Force PDF to be extracted using lattice-mode extraction
+        # (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)"
+        lattice = True if '2023' in file_name else False
+        pdf_reader = ChevroletPDFReader(bytes_io, lattice=lattice)
+
+        # DEBUG: Print the PDF tables.
+        # pdf_reader.print_tables()
+
+        # Here we are reading the data from every line of every table in the Introduction group of the PDF.
+        vehicles_data = []
+        table_group = ChevroletPDFReader.INTRODUCTION_GROUP
+        for i in range(pdf_reader.get_tables_count(table_group)):
+            for j in range(pdf_reader.get_lines_count(table_group, i)):
+                line_data = pdf_reader.get_line_values(table_group, i, j, {
+                    # Data of the column whose name is at least 85% similar to 'CÓDIGO VENDAS' will be stored in the 'sigla' key.
+                    ("CÓDIGO VENDAS", 85): "sigla",
+                    ("DESCRIÇÃO VENDAS", 85): "desc_vendas",
+                    ("MARCA/MODELO", 50): "num_renavam",
+                    ("DESCRIÇÃO NO CAT", 85): "desc_cat",
+                    ("PRODUÇÃO", 85): "producao",
+                })
+                vehicles_data.append(line_data)
+
+        # List of vehicles to be returned by the API.
+        vehicles = []
+        for vehicle_dict in vehicles_data:
+            # First we try to find the vehicle in the database using the 'sigla' as the search key.
+            sigla = vehicle_dict["sigla"]
+            vehicle_found = None
+            try:
+                vehicle_found = self.get_by_sigla(sigla)
+                # Ok, found!
+                # We will update the vehicle below.
+            except:
+                # Not found, no problem.
+                # We will create the vehicle below.
+                pass
+
+            # Setting the data from the PDF to the Veiculo object.
+            desc_cat = vehicle_dict["desc_cat"]
+            num_renavam = vehicle_dict["num_renavam"]
+            producao = vehicle_dict["producao"]
+            desc_vendas = vehicle_dict["desc_vendas"]
+            vehicle = Veiculo(sigla=sigla, desc_cat=desc_cat,
+                              num_renavam=num_renavam, producao=producao, desc_vendas=desc_vendas)
+            vehicle.status = "PROCESSADO"
+
+            # If the vehicle was not found, we create a new one.
+            # If the vehicle was found, we update it.
+            if vehicle_found is None:
+                vehicle.pdf_names = [file_name]
+                new_vehicle = self.create(vehicle)
+                vehicles.append(new_vehicle)
+            else:
+                vehicle.pdf_names = vehicle_found.pdf_names + [file_name]
+                updated_vehicle = self.update(sigla, vehicle)
+                vehicles.append(updated_vehicle)
+
+        # Returning the list of vehicles.
+        return vehicles
+
+    # This function will create a Veiculo object using the data read from a PDF file.
+    # It is specific for Jeep PDFs.
+    # (NOT IMPLEMENTED YET)
+    def _create_by_pdf_jeep(self, file_name: str, pdf_bytes: bytes) -> Veiculo:
+        pass
--- a/app/pdf/chevrolet/ChevroletPDFReader.py
+++ b/app/pdf/chevrolet/ChevroletPDFReader.py
+import tabula
+from fuzzywuzzy import fuzz
+from pandas import DataFrame
+from typing import Union, List, Dict, Tuple
+from io import BytesIO
+from app.pdf.exceptions import *
+from os import name as os_name
+
+
+# This class reads a PDF file and separates the tables into groups.
+# The tables are stored in a dictionary where the key is the table group name.
+# The value is a list of dataframes.
+#
+# The table groups are:
+# Introduction
+# Configuration / Configuration 2
+# Specification
+# Accessories / Accessories 2
+# Unknown
+#
+# This class offers methods so that manipulations with tabula-py are abstracted away.
+class ChevroletPDFReader:
+    # Possible table groups.
+    INTRODUCTION_GROUP = 'Introduction'
+    CONFIGURATION_GROUP = 'Configuration'
+    CONFIGURATION_GROUP_2 = 'Configuration 2'
+    SPECIFICATION_GROUP = 'Specification'
+    ACCESSORIES_1_GROUP = 'Accessories'
+    ACCESSORIES_2_GROUP = 'Accessories 2'
+    UNKNOWN_GROUP = 'Unknown'
+
+    # Specification table group and unknown group are not added here because they are special cases.
+    _TABLE_GROUP_NAMES = [INTRODUCTION_GROUP, CONFIGURATION_GROUP, CONFIGURATION_GROUP_2,
+                          ACCESSORIES_1_GROUP, ACCESSORIES_2_GROUP]
+
+    def __init__(self, pdf_bytes: BytesIO, lattice: bool = True, fuzzy_matching_ratio_threshold: int = 75):
+        # Check the OS, if it is Windows, then use ANSI encoding.
+        # Otherwise, use UTF-8.
+        encoding = 'ANSI' if os_name == 'nt' else 'utf-8'
+        # Read all tables from the PDF file.
+        dataframes = tabula.read_pdf(
+            pdf_bytes, pages='all', lattice=lattice, multiple_tables=True, encoding=encoding)
+        # Set the fuzzy matching ratio threshold.
+        # This is used to match column names and line names.
+        self._fuzzy_matching_ratio_threshold = fuzzy_matching_ratio_threshold
+        # Call the initial setup method.
+        self._initial_setup(dataframes)
+
+    # This method separates the tables into groups and sanitizes the dataframes.
+    def _initial_setup(self, dataframes: List[DataFrame]) -> None:
+        # Map of tables by group.
+        self._tables_by_group: Dict[str, List[DataFrame]] = {}
+        # Initialize the map with empty lists.
+        for table_group in self._TABLE_GROUP_NAMES:
+            self._tables_by_group[table_group] = []
+        self._tables_by_group[ChevroletPDFReader.SPECIFICATION_GROUP] = []
+        self._tables_by_group[ChevroletPDFReader.UNKNOWN_GROUP] = []
+        # Variable to keep track of the current table group.
+        current_table_group_index = 0
+        # Removing new lines from column names and removing unnamed columns.
+        # Also removing empty dataframes.
+        for dataframe in dataframes:
+            table_group = self._TABLE_GROUP_NAMES[current_table_group_index] if current_table_group_index < len(
+                self._TABLE_GROUP_NAMES) else ChevroletPDFReader.UNKNOWN_GROUP
+            # Don't consider empty tables.
+            if dataframe.empty or len(dataframe.columns) == 0:
+                continue
+            for column in dataframe.columns:
+                # Transform all column data to string to avoid errors found.
+                dataframe[column] = dataframe[column].astype(str)
+                # Remove new lines from column names.
+                dataframe.rename(
+                    columns={column: column.replace('\r', ' ')}, inplace=True)
+                # Remove unnamed columns.
+                if 'unnamed' in str.lower(column):
+                    del dataframe[column]
+            # Don't consider tables with only one column.
+            # Unless it is a technical specifications table.
+            if len(dataframe.columns) == 1:
+                # Check if it is a technical specifications table.
+                # (this table is weird and must be handled separately).
+                # Get the only column name.
+                column_name = dataframe.columns[0]
+                # Check if the column name is similar to "Especificações Técnicas".
+                ratio = fuzz.ratio(str.lower(column_name),
+                                   'especificações técnicas')
+                if ratio < 50:
+                    continue
+                # If it is, then it is a technical specifications table.
+                self._tables_by_group[ChevroletPDFReader.SPECIFICATION_GROUP].append(
+                    dataframe)
+                continue
+            # Is the current table of the same group as the previous one?
+            # Or is it a completely new group?
+            if len(self._tables_by_group[table_group]) == 0:
+                self._tables_by_group[table_group].append(dataframe)
+            else:
+                # Check if the current table is of the same group as the previous one.
+                # To do that, check if the number of columns is the same and if the columns are the same.
+                previous_table = self._tables_by_group[table_group][-1]
+                if len(previous_table.columns) != len(dataframe.columns):
+                    current_table_group_index += 1
+                else:
+                    # Check if all columns are the same.
+                    same_columns = True
+                    for column in previous_table.columns:
+                        if column not in dataframe.columns:
+                            same_columns = False
+                            break
+                    if not same_columns:
+                        # Increment the current table group index.
+                        # This is to change the current table group from now on.
+                        current_table_group_index += 1
+                # Append the table to the current table group.
+                table_group = self._TABLE_GROUP_NAMES[current_table_group_index] if current_table_group_index < len(
+                    self._TABLE_GROUP_NAMES) else ChevroletPDFReader.UNKNOWN_GROUP
+                self._tables_by_group[table_group].append(dataframe)
+
+    # This is a very complex method, which will be explained below:
+    #
+    # This method basically returns the value of a cell in a table.
+    # However, this method is very flexible and can be used in many ways:
+    #
+    # Two parameters are always the same:
+    #
+    # table_group: str => The table group name.
+    # table_index: int => The table index (position) in the table group.
+    #
+    # The last two parameters can be used in two ways:
+    #
+    # column_index_or_name: int OR str => The column index or the column name.
+    # line_number_or_name: int OR Tuple[int, str] => The line number or a tuple (column_number, line_name).
+    #
+    # The column_index_or_name parameter can be an integer or a string. If it is an integer, it will be used as the
+    # column index. If it is a string, it will be used as the column name. Column names are matched using fuzzywuzzy
+    # (fuzzy string matching).
+    #
+    # The line_number_or_name parameter can be an integer or a tuple (column_number, line_name). If it is an integer,
+    # it will be used as the line number. If it is a tuple, the line used is the one whose cell in column
+    # 'column_number' is most similar to 'line_name'. Line names are also matched using fuzzywuzzy.
+    def get_cell_value(self,
+                       table_group: str,
+                       table_index: int,
+                       column_index_or_name: Union[int, str],
+                       line_number_or_name: Union[int, Tuple[int, str]]) -> str:
+        # Raise an exception if the table group is not found.
+        if table_group not in self._tables_by_group:
+            raise TableGroupNotFoundException(
+                f'Table group "{table_group}" not found.')
+        # Raise an exception if the table index is out of range.
+        if table_index >= len(self._tables_by_group[table_group]):
+            raise TableIndexOutOfBoundsException(
+                f'Table index "{table_index}" out of range for table group "{table_group}".')
+        table = self._tables_by_group[table_group][table_index]
+        # Which column to use?
+        # Should we use the column name or the column index?
+        # It depends on the type of the column_index_or_name parameter (int or str).
+        column_index = -1
+        # If using column name...
+        if isinstance(column_index_or_name, str):
+            # Using fuzzywuzzy to find the most similar column name.
+            most_similar_column_index = -1
+            most_similar_column_ratio = -1
+            for index, name in enumerate(table.columns):
+                ratio = fuzz.ratio(
+                    str.lower(name), str.lower(column_index_or_name))
+                if ratio >= self._fuzzy_matching_ratio_threshold and ratio > most_similar_column_ratio:
+                    most_similar_column_index = index
+                    most_similar_column_ratio = ratio
+            column_index = most_similar_column_index
+        # If using column index...
+        elif isinstance(column_index_or_name, int):
+            column_index = column_index_or_name
+        else:
+            # Raise an exception if the column index or name is of an invalid type.
+            raise InvalidArgumentTypeException(
+                f'Invalid argument type for column_index_or_name: {type(column_index_or_name)}. Expected int or str.')
+        # Ok, now we have the column index.
+        # Check if column index is still -1.
+        # If so, that means that the column name was not found.
+        if column_index == -1:
+            raise ColumnNotFoundException(
+                f'Column with name "{column_index_or_name}" or similar not found for table group "{table_group}" and table index "{table_index}".')
+        # Check if the column index is out of range.
+        if column_index >= len(table.columns):
+            raise ColumnIndexOutOfBoundsException(
+                f'Column index "{column_index}" out of range for table group "{table_group}", table index "{table_index}" and column index "{column_index_or_name}".')
+        # Now, should we use the line number or a tuple (column_number, line_name)?
+        # It depends on the type of the line_number_or_name parameter (int or Tuple[int, str]).
+        # If using line number...
+        if isinstance(line_number_or_name, int):
+            # Raise an exception if the line number is out of range.
+            if line_number_or_name >= len(table):
+                raise LineIndexOutOfBoundsException(
+                    f'Line index "{line_number_or_name}" out of range for table group "{table_group}" and table index "{table_index}".')
+            return table.iloc[line_number_or_name, column_index]
+        # If using tuple (column_number, line_name)...
+        elif isinstance(line_number_or_name, Tuple):
+            # Using fuzzywuzzy to find the most similar line name.
+            most_similar_line_index = -1
+            most_similar_line_ratio = -1
+            column_number = line_number_or_name[0]
+            line_name = line_number_or_name[1]
+            for index, name in enumerate(table.iloc[:, column_number]):
+                ratio = fuzz.ratio(str.lower(name), str.lower(line_name))
+                if ratio >= self._fuzzy_matching_ratio_threshold and ratio > most_similar_line_ratio:
+                    most_similar_line_index = index
+                    most_similar_line_ratio = ratio
+            # Raise an exception if the line name was not found.
+            if most_similar_line_ratio == -1:
+                raise LineNotFoundException(
+                    f'Line with name "{line_name}" or similar not found for table group "{table_group}", table index "{table_index}" and column index "{column_index}".')
+            return table.iloc[most_similar_line_index, column_index]
+        else:
+            # Raise an exception if the line number or name is of an invalid type.
+            raise InvalidArgumentTypeException(
+                f'Invalid argument type for line_number_or_name: {type(line_number_or_name)}. Expected int or tuple (column_number, line_name).')
+
+    # This function will return all the data from one line of a table.
+    # This method also needs the table group name and the table index.
+    #
+    # The line number is the line index (position) in the table.
+    #
+    # The column_similarity_rules is an optional parameter that can be used to mold how
+    # the dictionary data will be returned:
+    #
+    # Example:
+    #
+    # column_similarity_rules = {
+    #     ('CODIGO DE VENDAS', 80): 'cod_vendas', # Using fuzzy matching with a ratio of 80.
+    #     ('MARCA/MODELO', 85): 'brand', # Using fuzzy matching with a ratio of 85.
+    # }
+    #
+    # Through string fuzzy matching, the column names will be compared to the keys of the
+    # column_similarity_rules dictionary. When matches are found, the value of the dictionary
+    # will be used as the key of the returned dictionary, as below:
+    #
+    # {
+    #     'cod_vendas': '123456',
+    #     'brand': 'FORD FOCUS',
+    # }
+    def get_line_values(self,
+                        table_group: str,
+                        table_index: int,
+                        line_number: int,
+                        column_similarity_rules: Union[Dict[Tuple[str, int], str], None] = None) -> Dict[str, str]:
+        # Raise an exception if the table group is not found.
+        if table_group not in self._tables_by_group:
+            raise TableGroupNotFoundException(
+                f'Table group "{table_group}" not found.')
+        # Raise an exception if the table index is out of range.
+        if table_index >= len(self._tables_by_group[table_group]):
+            raise TableIndexOutOfBoundsException(
+                f'Table index "{table_index}" out of range for table group "{table_group}".')
+        table = self._tables_by_group[table_group][table_index]
+        # Raise an exception if the line number is out of range.
+        if line_number >= len(table):
+            raise LineIndexOutOfBoundsException(
+                f'Line index "{line_number}" out of range for table group "{table_group}" and table index "{table_index}".')
+        line_data = {}
+        for column_name, column_value in table.iloc[line_number].items():
+            if column_similarity_rules is not None:
+                # If using column similarity rules, use the value of the first matching rule as the key of the returned dictionary (columns matching no rule are skipped).
+                for column_name_ratio_tuple_rule, column_key_rule in column_similarity_rules.items():
+                    column_name_rule = column_name_ratio_tuple_rule[0]
+                    column_ratio_rule = column_name_ratio_tuple_rule[1]
+                    ratio = fuzz.ratio(
+                        str.lower(column_name), str.lower(column_name_rule))
+                    if ratio >= column_ratio_rule:
+                        line_data[column_key_rule] = column_value
+                        break
+            else:
+                # If not using column similarity rules, just use the column name as the key.
+                line_data[column_name] = column_value
+        return line_data
+
+    # This function returns the number of tables in a table group.
+    def get_tables_count(self, table_group: str) -> int:
+        # Raise an exception if the table group is not found.
+        if table_group not in self._tables_by_group:
+            raise TableGroupNotFoundException(
+                f'Table group "{table_group}" not found.')
+        return len(self._tables_by_group[table_group])
+
+    # This function returns the number of columns in a table.
+    def get_columns_count(self, table_group: str, table_index: int) -> int:
+        # Raise an exception if the table group is not found.
+        if table_group not in self._tables_by_group:
+            raise TableGroupNotFoundException(
+                f'Table group "{table_group}" not found.')
+        # Raise an exception if the table index is out of range.
+        if table_index >= len(self._tables_by_group[table_group]):
+            raise TableIndexOutOfBoundsException(
+                f'Table index "{table_index}" out of range for table group "{table_group}".')
+        return len(self._tables_by_group[table_group][table_index].columns)
+
+    # This function returns the number of lines in a table.
+    def get_lines_count(self, table_group: str, table_index: int) -> int:
+        # Raise an exception if the table group is not found.
+        if table_group not in self._tables_by_group:
+            raise TableGroupNotFoundException(
+                f'Table group "{table_group}" not found.')
+        # Raise an exception if the table index is out of range.
+        if table_index >= len(self._tables_by_group[table_group]):
+            raise TableIndexOutOfBoundsException(
+                f'Table index "{table_index}" out of range for table group "{table_group}".')
+        return len(self._tables_by_group[table_group][table_index])
+
+    # Only for debugging purposes.
+    # Prints all the tables in the reader.
+    def print_tables(self):
+        for group_name, tables in self._tables_by_group.items():
+            print(f"Group: {group_name}")
+            for index, table in enumerate(tables):
+                print(f"Table {index}:")
+                print(table)
+                print('')
+
+
+# ======================================================================================================================
+# Demo usage, uncomment to test.
+
+# # Instantiate the reader.
+# reader = ChevroletPDFReader('carros.pdf')
+
+# # Example 1: Get the value of the cell in the 'Marca/Modelo' column in the first line of the first table in the
+# # 'Introduction' group.
+# value_1 = reader.get_cell_value(
+#     ChevroletPDFReader.INTRODUCTION_GROUP, 0, 'Marca/Modelo', 0)
+# print(f"Value 1: {value_1}")
+
+# # Example 2: Get the value of the cell in the first column, in the first line of the second table in the
+# # 'Introduction' group.
+# value_2 = reader.get_cell_value(
+#     ChevroletPDFReader.INTRODUCTION_GROUP, 1, 0, 0)
+# print(f"Value 2: {value_2}")
+
+# value_3 = reader.get_cell_value(
+#     ChevroletPDFReader.CONFIGURATION_GROUP, 0, 0, 0)
+# print(f"Value 3: {value_3}")
+
+# # Using the first configuration table, check if the 'LT Turbo 116cv' configuration has the 'Brake Light' option.
+# # This is done by checking if the value of the cell in the 'LT Turbo 116cv' column in the 'Brake Light' line is
+# # a X or not.
+# value_4 = reader.get_cell_value(
+#     ChevroletPDFReader.CONFIGURATION_GROUP, 0, 'LT Turbo 116cv', (0, 'Brake Light'))
+# print(f"Value 4: {value_4}")
--- a/app/pdf/exceptions.py
+++ b/app/pdf/exceptions.py
+# Custom exceptions classes - START.
+
+class TableGroupNotFoundException(Exception):
+    pass
+
+
+class TableIndexOutOfBoundsException(Exception):
+    pass
+
+
+class ColumnIndexOutOfBoundsException(Exception):
+    pass
+
+
+class LineIndexOutOfBoundsException(Exception):
+    pass
+
+
+class ColumnNotFoundException(Exception):
+    pass
+
+
+class LineNotFoundException(Exception):
+    pass
+
+
+class InvalidArgumentTypeException(Exception):
+    pass
+
+# Custom exceptions classes - END.
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -24,7 +24,7 @@ services:
      - ENVIRONMENT=production
    container_name: veiculos-via-montadora-api
    environment:
-      - MONGODB_HOST=<SECRET>
+      - MONGODB_HOST=mongodb://mongo_user:mongo_password@host.docker.internal:27017
    ports:
      - "8000:8000"
    depends_on:

--- a/poetry.lock
+++ b/poetry.lock
@@ -165,6 +165,21 @@ dev = ["pre-commit (>=2.17.0,<3.0.0)", "ruff (==0.0.138)", "uvicorn[standard] (>
 doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-markdownextradata-plugin (>=0.1.7,<0.3.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pyyaml (>=5.3.1,<7.0.0)", "typer-cli (>=0.0.13,<0.0.14)", "typer[all] (>=0.6.1,<0.8.0)"]
 test = ["anyio[trio] (>=3.2.1,<4.0.0)", "black (==23.1.0)", "coverage[toml] (>=6.5.0,<8.0)", "databases[sqlite] (>=0.3.2,<0.7.0)", "email-validator (>=1.1.1,<2.0.0)", "flask (>=1.1.2,<3.0.0)", "httpx (>=0.23.0,<0.24.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.982)", "orjson (>=3.2.1,<4.0.0)", "passlib[bcrypt] (>=1.7.2,<2.0.0)", "peewee (>=3.13.3,<4.0.0)", "pytest (>=7.1.3,<8.0.0)", "python-jose[cryptography] (>=3.3.0,<4.0.0)", "python-multipart (>=0.0.5,<0.0.7)", "pyyaml (>=5.3.1,<7.0.0)", "ruff (==0.0.138)", "sqlalchemy (>=1.3.18,<1.4.43)", "types-orjson (==3.6.2)", "types-ujson (==5.7.0.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0,<6.0.0)"]

+[[package]]
+name = "fuzzywuzzy"
+version = "0.18.0"
+description = "Fuzzy string matching in python"
+category = "main"
+optional = false
+python-versions = "*"
+files = [
+    {file = "fuzzywuzzy-0.18.0-py2.py3-none-any.whl", hash = "sha256:928244b28db720d1e0ee7587acf660ea49d7e4c632569cad4f1cd7e68a5f0993"},
+    {file = "fuzzywuzzy-0.18.0.tar.gz", hash = "sha256:45016e92264780e58972dca1b3d939ac864b78437422beecebb3095f8efd00e8"},
+]
+
+[package.extras]
+speedup = ["python-levenshtein (>=0.12)"]
+
 [[package]]
 name = "h11"
 version = "0.14.0"
@@ -759,4 +774,4 @@ standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)",
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9.0"
-content-hash = "89e974075321546e7841055b5ac967f55304124a2276439cc602e7252ade0f3f"
+content-hash = "18d643ac7f84e4997e115ecea512929d8bb392c08232ba092661377077fe82db"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,6 +15,7 @@ pymongo = "4.3.3"
 tabula-py = "2.7.0"
 httpx = "0.23.3"
 python-multipart = "0.0.6"
+fuzzywuzzy = "0.18.0"

 [tool.poetry.group.dev.dependencies]
 autopep8 = "2.0.2"