% GradMask conference paper (KDD 2022).
% The system name is brace-protected in the title so sentence-casing styles keep its capitalisation.
% The bare doi field mirrors the identifier already present in the url field.
@inproceedings{Moon-KDD-22,
  abstract = {We present GradMask, a simple adversarial example detection scheme for natural language processing (NLP) models. It uses gradient signals to detect adversarially perturbed tokens in an input sequence and occludes such tokens by a masking process. GradMask provides several advantages over existing methods including improved detection performance and an interpretation of its decision with a only moderate computational cost. Its approximated inference cost is no more than a single forward- and back-propagation through the target model without requiring any additional detection module. Extensive evaluation on widely adopted NLP benchmark datasets demonstrate the efficiency and effectiveness of GradMask.},
  address = {Washington DC, USA},
  author = {Han-Cheol Moon and Shafiq Joty and Xu Chi},
  booktitle = {28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining},
  doi = {10.1145/3534678.3539206},
  publisher = {ACM},
  series = {SIGKDD'22},
  title = {{GradMask}: Gradient-Guided Token Masking for Textual Adversarial Example Detection},
  url = {https://dl.acm.org/doi/abs/10.1145/3534678.3539206},
  year = {2022}
}