# rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py
# RTMPose-L whole-body (COCO-WholeBody + UBody) top-down config, 384x288 input.
  1. #_base_ = ['../../../_base_/default_runtime.py']
  2. _base_ = ['default_runtime.py']
  3. # runtime
  4. max_epochs = 270
  5. stage2_num_epochs = 30
  6. base_lr = 4e-3
  7. train_batch_size = 32
  8. val_batch_size = 32
  9. train_cfg = dict(max_epochs=max_epochs, val_interval=10)
  10. randomness = dict(seed=21)
  11. # optimizer
  12. optim_wrapper = dict(
  13. type='OptimWrapper',
  14. optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
  15. paramwise_cfg=dict(
  16. norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
  17. # learning rate
  18. param_scheduler = [
  19. dict(
  20. type='LinearLR',
  21. start_factor=1.0e-5,
  22. by_epoch=False,
  23. begin=0,
  24. end=1000),
  25. dict(
  26. # use cosine lr from 150 to 300 epoch
  27. type='CosineAnnealingLR',
  28. eta_min=base_lr * 0.05,
  29. begin=max_epochs // 2,
  30. end=max_epochs,
  31. T_max=max_epochs // 2,
  32. by_epoch=True,
  33. convert_to_iter_based=True),
  34. ]
  35. # automatically scaling LR based on the actual training batch size
  36. auto_scale_lr = dict(base_batch_size=512)
  37. # codec settings
  38. codec = dict(
  39. type='SimCCLabel',
  40. input_size=(288, 384),
  41. sigma=(6., 6.93),
  42. simcc_split_ratio=2.0,
  43. normalize=False,
  44. use_dark=False)
  45. # model settings
  46. model = dict(
  47. type='TopdownPoseEstimator',
  48. data_preprocessor=dict(
  49. type='PoseDataPreprocessor',
  50. mean=[123.675, 116.28, 103.53],
  51. std=[58.395, 57.12, 57.375],
  52. bgr_to_rgb=True),
  53. backbone=dict(
  54. _scope_='mmdet',
  55. type='CSPNeXt',
  56. arch='P5',
  57. expand_ratio=0.5,
  58. deepen_factor=1.,
  59. widen_factor=1.,
  60. out_indices=(4, ),
  61. channel_attention=True,
  62. norm_cfg=dict(type='SyncBN'),
  63. act_cfg=dict(type='SiLU'),
  64. init_cfg=dict(
  65. type='Pretrained',
  66. prefix='backbone.',
  67. checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
  68. 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa: E501
  69. )),
  70. head=dict(
  71. type='RTMCCHead',
  72. in_channels=1024,
  73. out_channels=133,
  74. input_size=codec['input_size'],
  75. in_featuremap_size=(9, 12),
  76. simcc_split_ratio=codec['simcc_split_ratio'],
  77. final_layer_kernel_size=7,
  78. gau_cfg=dict(
  79. hidden_dims=256,
  80. s=128,
  81. expansion_factor=2,
  82. dropout_rate=0.,
  83. drop_path=0.,
  84. act_fn='SiLU',
  85. use_rel_bias=False,
  86. pos_enc=False),
  87. loss=dict(
  88. type='KLDiscretLoss',
  89. use_target_weight=True,
  90. beta=10.,
  91. label_softmax=True),
  92. decoder=codec),
  93. test_cfg=dict(flip_test=True, ))
  94. # base dataset settings
  95. dataset_type = 'UBody2dDataset'
  96. data_mode = 'topdown'
  97. data_root = 'data/UBody/'
  98. backend_args = dict(backend='local')
  99. scenes = [
  100. 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
  101. 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
  102. 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
  103. ]
  104. train_datasets = [
  105. dict(
  106. type='CocoWholeBodyDataset',
  107. data_root='data/coco/',
  108. data_mode=data_mode,
  109. ann_file='annotations/coco_wholebody_train_v1.0.json',
  110. data_prefix=dict(img='train2017/'),
  111. pipeline=[])
  112. ]
  113. for scene in scenes:
  114. train_dataset = dict(
  115. type=dataset_type,
  116. data_root=data_root,
  117. data_mode=data_mode,
  118. ann_file=f'annotations/{scene}/train_annotations.json',
  119. data_prefix=dict(img='images/'),
  120. pipeline=[],
  121. sample_interval=10)
  122. train_datasets.append(train_dataset)
  123. # pipelines
  124. train_pipeline = [
  125. dict(type='LoadImage', backend_args=backend_args),
  126. dict(type='GetBBoxCenterScale'),
  127. dict(type='RandomFlip', direction='horizontal'),
  128. dict(type='RandomHalfBody'),
  129. dict(
  130. type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
  131. dict(type='TopdownAffine', input_size=codec['input_size']),
  132. dict(type='mmdet.YOLOXHSVRandomAug'),
  133. dict(
  134. type='Albumentation',
  135. transforms=[
  136. dict(type='Blur', p=0.1),
  137. dict(type='MedianBlur', p=0.1),
  138. dict(
  139. type='CoarseDropout',
  140. max_holes=1,
  141. max_height=0.4,
  142. max_width=0.4,
  143. min_holes=1,
  144. min_height=0.2,
  145. min_width=0.2,
  146. p=1.0),
  147. ]),
  148. dict(type='GenerateTarget', encoder=codec),
  149. dict(type='PackPoseInputs')
  150. ]
  151. val_pipeline = [
  152. dict(type='LoadImage', backend_args=backend_args),
  153. dict(type='GetBBoxCenterScale'),
  154. dict(type='TopdownAffine', input_size=codec['input_size']),
  155. dict(type='PackPoseInputs')
  156. ]
  157. train_pipeline_stage2 = [
  158. dict(type='LoadImage', backend_args=backend_args),
  159. dict(type='GetBBoxCenterScale'),
  160. dict(type='RandomFlip', direction='horizontal'),
  161. dict(type='RandomHalfBody'),
  162. dict(
  163. type='RandomBBoxTransform',
  164. shift_factor=0.,
  165. scale_factor=[0.5, 1.5],
  166. rotate_factor=90),
  167. dict(type='TopdownAffine', input_size=codec['input_size']),
  168. dict(type='mmdet.YOLOXHSVRandomAug'),
  169. dict(
  170. type='Albumentation',
  171. transforms=[
  172. dict(type='Blur', p=0.1),
  173. dict(type='MedianBlur', p=0.1),
  174. dict(
  175. type='CoarseDropout',
  176. max_holes=1,
  177. max_height=0.4,
  178. max_width=0.4,
  179. min_holes=1,
  180. min_height=0.2,
  181. min_width=0.2,
  182. p=0.5),
  183. ]),
  184. dict(type='GenerateTarget', encoder=codec),
  185. dict(type='PackPoseInputs')
  186. ]
  187. # data loaders
  188. train_dataloader = dict(
  189. batch_size=train_batch_size,
  190. num_workers=10,
  191. persistent_workers=True,
  192. sampler=dict(type='DefaultSampler', shuffle=True),
  193. dataset=dict(
  194. type='CombinedDataset',
  195. metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
  196. datasets=train_datasets,
  197. pipeline=train_pipeline,
  198. test_mode=False,
  199. ))
  200. val_dataloader = dict(
  201. batch_size=val_batch_size,
  202. num_workers=10,
  203. persistent_workers=True,
  204. drop_last=False,
  205. sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
  206. dataset=dict(
  207. type='CocoWholeBodyDataset',
  208. data_root=data_root,
  209. data_mode=data_mode,
  210. ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
  211. bbox_file='data/coco/person_detection_results/'
  212. 'COCO_val2017_detections_AP_H_56_person.json',
  213. data_prefix=dict(img='coco/val2017/'),
  214. test_mode=True,
  215. pipeline=val_pipeline,
  216. ))
  217. test_dataloader = val_dataloader
  218. # hooks
  219. default_hooks = dict(
  220. checkpoint=dict(
  221. save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
  222. custom_hooks = [
  223. dict(
  224. type='EMAHook',
  225. ema_type='ExpMomentumEMA',
  226. momentum=0.0002,
  227. update_buffers=True,
  228. priority=49),
  229. dict(
  230. type='mmdet.PipelineSwitchHook',
  231. switch_epoch=max_epochs - stage2_num_epochs,
  232. switch_pipeline=train_pipeline_stage2)
  233. ]
  234. # evaluators
  235. val_evaluator = dict(
  236. type='CocoWholeBodyMetric',
  237. ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
  238. test_evaluator = val_evaluator