BRE/webapi/portal/data_reader.py

197 lines
6.2 KiB
Python
Raw Normal View History

2024-06-07 19:50:21 +03:00
'''Reading data from Excel spreadsheets'''
from enum import IntEnum, unique
from datetime import datetime
import pandas
from .info_models import FieldType, InputMethod, text_to_method
# Lookup table used by _ContentColumns.to_field(): the element at index N is
# the FieldType for the sheet column whose _ContentColumns value is N.
# The order is therefore significant and must stay aligned with that enum.
_ContentFields = [
    FieldType.task_type,
    FieldType.status,
    FieldType.content_name,
    # Index 3 has no _ContentColumns member; FieldType.skip keeps the
    # following entries aligned with their column numbers.
    FieldType.skip,
    FieldType.change_score,
    FieldType.biblio_name,
    FieldType.definition,
    FieldType.is_immutable,
    FieldType.object_type,
    FieldType.markers,
    FieldType.tags,
    FieldType.author,
    FieldType.supervisor,
    FieldType.executor,
    FieldType.task_manager,
    FieldType.responsible,
    FieldType.department,
    FieldType.date_target,
    FieldType.source,
    FieldType.electron_bre,
    FieldType.main_page,
    FieldType.is_general,
    FieldType.actualize_period,
    FieldType.age_restriction,
    FieldType.priority,
    FieldType.article_type,
    FieldType.date_exchange,
    FieldType.date_ees1,
    FieldType.date_ex_tools,
    FieldType.date_ees2,
    FieldType.expert,
    FieldType.contract,
    FieldType.comment,
    FieldType.task_id,
    FieldType.content_name_db,
    FieldType.task_name,
]
@unique
class _ContentColumns(IntEnum):
task_type = 0
status = 1
content_name = 2
change_score = 4
biblio_name = 5
definition = 6
is_immutable = 7
object_type = 8
markers = 9
tags = 10
author = 11
supervisor = 12
executor = 13
task_manager = 14
responsible = 15
department = 16
date_target = 17
source = 18
electron_bre = 19
main_page = 20
is_general = 21
actualize_period = 22
age_restriction = 23
priority = 24
article_type = 25
date_exchange = 26
date_ees1 = 27
date_ex_tools = 28
date_ees2 = 29
expert = 30
contract = 31
comment = 32
task_id = 33
content_name_db = 34
task_name = 35
def to_field(self) -> FieldType:
'''Transform metadata column to FieldType'''
return _ContentFields[self.value]
def _get_task_name(content_name: str, is_immutable: bool) -> str:
UNMUTABLE_TEMPLATE = 'Неизменные {} (библиография+корректура+транскрипция)'
if not is_immutable:
return content_name
else:
return UNMUTABLE_TEMPLATE.format(content_name)
def _drop_from_nan(target: pandas.DataFrame) -> pandas.DataFrame:
rows_with_nan = [index for index, row in target.iterrows() if pandas.isna(row.iloc[0])]
if len(rows_with_nan) > 0:
return target[:rows_with_nan[0]]
else:
return target
class ContentIterator:
    '''Cursor over the rows of the content (metadata) sheet.

    The cursor starts on the first row. read_row() returns the data of the
    current row, next() advances the cursor, and is_done() reports that the
    cursor has moved past the last row.
    '''

    # Output format for date cells.
    _DATE_FORMAT = '%d.%m.%Y'
    # Cell text that is interpreted as boolean True for yes/no fields.
    _YES = 'Да'

    def __init__(self, data: pandas.DataFrame):
        self._data = data
        self._row = 0
        self._count = len(self._data.index)

    def is_done(self) -> bool:
        '''Indicates end of iteration'''
        return self._row >= self._count

    def next(self) -> bool:
        '''Advance to the next row.

        Returns False, without moving, when the cursor is already past the
        last row; otherwise moves one row forward and returns True.
        '''
        if self.is_done():
            return False
        self._row += 1
        return True

    def _convert_cell(self, field, value):
        '''Convert one non-empty cell value according to its field.'''
        if field.input_method() in (InputMethod.combo_dialog,
                                    InputMethod.combo_dialog_simple_list):
            # List-valued fields: ';'-separated items, blanks discarded.
            items = (element.strip() for element in value.split(';'))
            return [item for item in items if item]
        if isinstance(value, str):
            return value.strip()
        if isinstance(value, datetime):
            # pandas.Timestamp is a datetime subclass, so it is handled here too.
            return value.strftime(self._DATE_FORMAT)
        return value

    def read_row(self) -> dict:
        '''Return the current row as a dict keyed by FieldType.

        Empty cells are omitted. After the per-cell conversion the row is
        post-processed:

        * yes/no fields become booleans (True only for the text 'Да');
        * content_name_db defaults to content_name;
        * task_name defaults to a name derived from content_name;
        * department gets a fixed prefix and is copied to the editorial field;
        * the first article_type item gets a fixed suffix.

        Must not be called once is_done() is true: the positional cell lookup
        is then out of range.

        Raises:
            KeyError: if the content_name cell is empty while content_name_db
                or task_name has to be derived from it.
        '''
        data = {}
        for column in _ContentColumns:
            value = self._data.iat[self._row, column]
            if pandas.isna(value):
                continue
            field = column.to_field()
            data[field] = self._convert_cell(field, value)

        yes_no_fields = (
            FieldType.is_immutable,
            FieldType.electron_bre,
            FieldType.main_page,
            FieldType.is_general,
        )
        for flag in yes_no_fields:
            if flag in data:
                data[flag] = data[flag] == self._YES

        if FieldType.content_name_db not in data:
            data[FieldType.content_name_db] = data[FieldType.content_name]
        if FieldType.task_name not in data:
            # is_immutable, when present, is already a bool at this point.
            is_immutable = data.get(FieldType.is_immutable, False)
            data[FieldType.task_name] = _get_task_name(data[FieldType.content_name], is_immutable)

        if FieldType.department in data:
            data[FieldType.department] = 'Редакция ' + data[FieldType.department]
            data[FieldType.editorial] = data[FieldType.department]

        # A cell such as ' ; ' produces an empty list; there is then no first
        # item to decorate, so skip instead of failing with IndexError.
        article_types = data.get(FieldType.article_type)
        if article_types:
            article_types[0] = article_types[0] + ' статья'
        return data
class DataReader:
'''BRE data reader for Excel'''
_SHEET_CONTENT = 'Контент'
_SHEET_ATTRIBUTES = 'Признаки'
def __init__(self):
self._xls = None
self._content = None
self._attributes = None
def load(self, input_file: str) -> bool:
'''Load file'''
try:
self._xls = pandas.ExcelFile(input_file)
self._content = _drop_from_nan(pandas.read_excel(self._xls, DataReader._SHEET_CONTENT))
self._attributes = _drop_from_nan(pandas.read_excel(self._xls, DataReader._SHEET_ATTRIBUTES))
except (FileNotFoundError, ValueError):
return False
return True
def get_content(self) -> ContentIterator:
'''Return iterator for cards'''
return ContentIterator(self._content)
def get_attributes_for(self, content_name) -> list:
'''Return attributes list for specific content'''
filtered = self._attributes.loc[self._attributes['Название контента'] == content_name]
return [(row[1], text_to_method(row[3]), row[2]) for index, row in filtered.iterrows()]