leetcode 力扣 393 UTF-8 编码验证 题解 算法题

发布时间:   来源:文档文库   
字号:
题目:UTF-8编码验证
给定一个表示数据的整数数组data,返回它是否为有效的UTF-8编码。UTF-8中的一个字符可能的长度为14字节,遵循以下的规则:
1.对于1字节的字符,字节的第一位设为0,后面7位为这个符号的unicode
码。
2.对于n字节的字符(n>1,第一个字节的前n位都设为1,第n+1位设为0
后面字节的前两位一律设为10。剩下的没有提及的二进制位,全部为这个符号的unicode码。这是UTF-8编码的工作方式:
Char.numberrange|UTF-8octetsequence(hexadecimal|(binary
--------------------+---------------------------------------------00000000-0000007F|0xxxxxxx
00000080-000007FF|110xxxxx10xxxxxx
00000800-0000FFFF|1110xxxx10xxxxxx10xxxxxx
00010000-0010FFFF|11110xxx10xxxxxx10xxxxxx10xxxxxx
注意:输入是整数数组。只有每个整数的最低8个有效位用来存储数据。这意味着每个整数只表示1字节的数据。示例1
输入:data=[197,130,1]输出:true
解释:数据表示字节序列:110001011000001000000001
这是有效的utf-8编码,为一个2字节字符,跟着一个1字节字符。
示例2
输入:data=[235,140,4]输出:false
解释:数据表示8位的序列:111010111000110000000100.3位都是1,第4位为0表示它是一个3字节字符。下一个字节是开头为10的延续字节,这是正确的。
但第二个延续字节不以10开头,所以是不符合规则的。
提示:

1<=data.length<=2*1040<=data[i]<=255
语言:Java
classSolution{
publicbooleanvalidUtf8(int[]data{
//NumberofbytesinthecurrentUTF-8characterintnumberOfBytesToProcess=0;
//Foreachintegerinthedataarray.for(inti=0;ilength;i++{
//Getthebinaryrepresentation.Weonlyneedtheleastsignificant8bits//foranygivennumber.
StringbinRep=Integer.toBinaryString(data[i];binRep=
binRep.length(>=8
?binRep.substring(binRep.length(-8
:"00000000".substring(binRep.length(%8+binRep;
//IfthisisthecasethenwearetostartprocessinganewUTF-8character.if(numberOfBytesToProcess==0{
//Getthenumberof1sinthebeginningofthestring.for(intj=0;jlength(;j++{if(binRep.charAt(j=='0'{break;}
numberOfBytesToProcess+=1;}
//1bytecharacters
if(numberOfBytesToProcess==0{continue;}
//Invalidscenariosaccordingtotherulesoftheproblem.
if(numberOfBytesToProcess>4||numberOfBytesToProcess==1{returnfalse;}
}else{
//Else,weareprocessingintegerswhichrepresentbyteswhichareapartof//aUTF-8character.So,theymustadheretothepattern`10xxxxxx`.if(!(binRep.charAt(0=='1'&&binRep.charAt(1=='0'{returnfalse;}}
//Wereducethenumberofbytestoprocessby1aftereachinteger.numberOfBytesToProcess-=1;

}
//Thisisforthecasewherewemightnothavethecompletedatafor//aparticularUTF-8character.
returnnumberOfBytesToProcess==0;}}
语言:Java
classSolution{
publicbooleanvalidUtf8(int[]data{
//NumberofbytesinthecurrentUTF-8characterintnumberOfBytesToProcess=0;
//Maskstochecktwomostsignificantbitsinabyte.intmask1=1<<7;intmask2=1<<6;
//Foreachintegerinthedataarray.for(inti=0;ilength;i++{
//IfthisisthecasethenwearetostartprocessinganewUTF-8character.if(numberOfBytesToProcess==0{intmask=1<<7;
while((mask&data[i]!=0{numberOfBytesToProcess+=1;mask=mask>>1;}
//1bytecharacters
if(numberOfBytesToProcess==0{continue;}
//Invalidscenariosaccordingtotherulesoftheproblem.
if(numberOfBytesToProcess>4||numberOfBytesToProcess==1{returnfalse;}
}else{
//data[i]shouldhavemostsignificantbitsetand
//secondmostsignificantbitunset.So,weusethetwomasks//tomakesurethisisthecase.
if(!((data[i]&mask1!=0&&(mask2&data[i]==0{returnfalse;}}
//Wereducethenumberofbytestoprocessby1aftereachinteger.numberOfBytesToProcess-=1;}
//Thisisforthecasewherewemightnothavethecompletedatafor

//aparticularUTF-8character.
returnnumberOfBytesToProcess==0;}}
语言:Python
classSolution:
defvalidUtf8(self,data:"""
:typedata:List[int]:rtype:bool"""
#NumberofbytesinthecurrentUTF-8charactern_bytes=0
#Foreachintegerinthedataarray.fornumindata:
#Getthebinaryrepresentation.Weonlyneedtheleastsignificant8bits#foranygivennumber.
bin_rep=format(num,'#010b'[-8:]
#IfthisisthecasethenwearetostartprocessinganewUTF-8character.ifn_bytes==0:
#Getthenumberof1sinthebeginningofthestring.forbitinbin_rep:ifbit=='0':breakn_bytes+=1
#1bytecharactersifn_bytes==0:continue
#Invalidscenariosaccordingtotherulesoftheproblem.ifn_bytes==1orn_bytes>4:returnFalseelse:
#Else,weareprocessingintegerswhichrepresentbyteswhichareapartof#aUTF-8character.So,theymustadheretothepattern`10xxxxxx`.ifnot(bin_rep[0]=='1'andbin_rep[1]=='0':returnFalse
#Wereducethenumberofbytestoprocessby1aftereachinteger.n_bytes-=1
#Thisisforthecasewherewemightnothavethecompletedatafor#aparticularUTF-8character.returnn_bytes==0
语言:Python
classSolution:
defvalidUtf8(self,data:

"""
:typedata:List[int]:rtype:bool"""
#NumberofbytesinthecurrentUTF-8charactern_bytes=0
#Masktocheckifthemostsignificantbit(8thbitfromtheleftissetornotmask1=1<<7
#Masktocheckifthesecondmostsignificantbitissetornotmask2=1<<6fornumindata:
#Getthenumberofsetmostsignificantbitsinthebyteif#thisisthestartingbyteofanUTF-8character.mask=1<<7ifn_bytes==0:
whilemask&num:n_bytes+=1
mask=mask>>1
#1bytecharactersifn_bytes==0:continue
#Invalidscenariosaccordingtotherulesoftheproblem.ifn_bytes==1orn_bytes>4:returnFalseelse:
#IfthisbyteisapartofanexistingUTF-8character,thenwe
#simplyhavetolookatthetwomostsignificantbitsandwemake#useofthemaskswedefinedbefore.
ifnot(num&mask1andnot(num&mask2:returnFalsen_bytes-=1
returnn_bytes==0
语言:cpp
classSolution{public:
boolvalidUtf8(vector<int>&data{intc=0;
for(constint&num:data{
if(c==0{
if((num>>5==0b110c=1;
elseif((num>>4==0b1110c=2;elseif((num>>3==0b11110c=3;elseif((num>>7returnfalse;}

else{
if((num>>6!=0b10returnfalse;--c;}}
returnc==0;}};
语言:cpp
classAutomation{
vectorint>>table={{0,4,1,2,3,4},{4,0,4,4,4,4},{4,1,4,4,4,4},{4,2,4,4,4,4},{4,4,4,4,4,4}};
vector<unsignedchar>mask={0b10000000,0b11000000,0b11100000,0b11110000,0b11111000};
vector<unsignedchar>type={0b00000000,0b10000000,0b11000000,0b11100000,0b11110000};
intgetInputType(unsignedcharc{for(inti=0;i<5;++i{
if((c&mask[i]==type[i]returni;}
return5;}
public:
intstate=0;
intinput(charc{
state=table[state][getInputType(c];returnstate;}};
classSolution{public:
boolvalidUtf8(vector<int>&data{Automationautomation;for(inti:data{
unsignedchard=(unsignedchari;if(automation.input(d==4returnfalse;}
returnautomation.state==0?true:false;}};

语言:php
classSolution{
/**
*@paramInteger[]$data*@returnBoolean*/
functionvalidUtf8($data{$len=count($data;
for($i=0;$i<$len;$i++{$num=$data[$i];
if(($num<192&&$num>127||$num>247{returnfalse;}
if($num>=240{
if($this->check4($data,$i,$len===false{returnfalse;}
$i=$i+3;continue;}
if($num>=224{
if($this->check3($data,$i,$len===false{returnfalse;}
$i=$i+2;continue;}
if($num>=192{
if($this->check2($data,$i,$len===false{returnfalse;}
$i=$i+1;continue;}}
returntrue;}
privatefunctioncheck4($data,$i,$len{if($len-$i<4{returnfalse;}
return$data[$i]>=240&&$this->check($data[$i+1]&&$this->check($data[$i+2]&&$this->check($data[$i+3];}
privatefunctioncheck3($data,$i,$len{if($len-$i<3{returnfalse;}
return$data[$i]>=224&&$this->check($data[$i+1]&&$this->check($data[$i+2];}

privatefunctioncheck2($data,$i,$len{if($len-$i<2{returnfalse;}
return$data[$i]>=192&&$this->check($data[$i+1];}
privatefunctioncheck($num{
return$num>=128&&$num<=191;}}
语言:golang

funcvalidUtf8(data[]intbool{

getCount:=func(bintint{count:=0switch{
caseb>>3==0b11110:count=4
caseb>>4==0b1110:count=3
caseb>>5==0b110:count=2
caseb>>7==0b0:count=1default:
count=-2}
returncount}

count:=0
for_,b:=rangedata{ifcount==0{
count=getCount(bifcount<0{returnfalse}}else{
if!(b>>6==0b10{returnfalse}}
count--}

returncount==0}

语言:golang

funcvalidUtf8(data[]intbool{iflen(data==0{returnfalse}
count:=0
fori:=0;i//计算字节数ifcount==0{
count=bytelength(data[i]//count>0判断字节是否合法}else{
//是否是10xxxxxx
ifdata[i]&(3<<6!=1<<7{returnfalse}
count--}
//实际字节数多余countifcount==-1{returnfalse}}
returncount==0

}
funcbytelength(dataintint{//0xxxxxxx
ifdata>>7==0{return0}
//110xxxxx10xxxxxxifdata>>5==6{

return1}
//1110xxxx10xxxxxx10xxxxxxifdata>>4==14{return2}
//11110xxx10xxxxxx10xxxxxx10xxxxxxifdata>>3==30{return3}
return-1}

语言:golang
funcvalidUtf8(data[]intbool{
//masks分别用于检测开头若干位,head用于检测是否10两位masks:=[]int{0xf8,0xf0,0xe0,0x80}head:=0xc0
fori:=0;ibytes:=4
num:=data[i]
for_,mask:=rangemasks[:]{
if(mask&num==(mask&(mask-1{break}
bytes--}
ifbytes<1||i+bytes>len(data{returnfalse}
ifbytes==1{continue}
forj:=1;jnum=data[i+j]
if(num&head!=(head&(head-1{returnfalse}}
i+=bytes-1}
returntrue}
语言:javascript
/**
*@param{number[]}data*@return{boolean}*/
varvalidUtf8=function(data{letmemo={};
data=data.map(e=>{
letbinaryStr=e.toString(2;if(binaryStr.length>8{
binaryStr=binaryStr.slice(0,8;returnbinaryStr;
}elseif(binaryStr.length<8{
binaryStr='0'.repeat(8-binaryStr.length+binaryStr;returnbinaryStr;}
returnbinaryStr;};
returndfs(data,0;

functiondfs(data,startIndx{
if(startIndx>=data.lengthreturntrue;
if(memo[startIndx]!==undefinedreturnmemo[startIndx];if(data[startIndx][0]==='0'{
constbool=dfs(data,startIndx+1;memo[startIndx]=bool;returnbool;}
if(data[startIndx].slice(0,3==='110'&&data[startIndx+1]&&data[startIndx+1].slice(0,2==='10'{
constbool=dfs(data,startIndx+2;memo[startIndx]=bool;returnbool;}
if(data[startIndx].slice(0,4==='1110'&&data[startIndx+1]&&data[startIndx+1].slice(0,2==='10'&&data[startIndx+2]&&data[startIndx+2].slice(0,2==='10'{constbool=dfs(data,startIndx+3;memo[startIndx]=bool;returnbool;}
if(data[startIndx].slice(0,5==='11110'&&data[startIndx+1]&&data[startIndx+1].slice(0,2==='10'&&data[startIndx+2]&&data[startIndx+2].slice(0,2==='10'&&data[startIndx+3]&&data[startIndx+3].slice(0,2==='10'{constbool=dfs(data,startIndx+4;memo[startIndx]=bool;returnbool;}
memo[startIndx]=false;returnfalse;}};

本文来源:https://www.2haoxitong.net/k/doc/17751530e1bd960590c69ec3d5bbfd0a7956d5e7.html

《leetcode 力扣 393 UTF-8 编码验证 题解 算法题.doc》
将本文的Word文档下载到电脑,方便收藏和打印
推荐度:
点击下载文档

文档为doc格式