REGEXP to remove html tags and get the word count of a content
SELECT content_id, COUNT(words) wordcount
FROM (
SELECT content_id, SPLIT(tt, '') words
FROM (
SELECT content_id,
REGEXP_REPLACE(content, r'(<[^>]+>|\&(nbsp;)|(amp;)|&#\d\d\d\d)', '') tt,
FROM [project:dataset.table] ))
GROUP BY content_id